diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..0086358db1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: true diff --git a/.github/ISSUE_TEMPLATE/issue_report.yml b/.github/ISSUE_TEMPLATE/issue_report.yml new file mode 100644 index 0000000000..ef6e6faa1b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/issue_report.yml @@ -0,0 +1,221 @@ +name: Issue Report +description: File a report for ROCm related issues on Linux and Windows. For issues pertaining to documentation or non-bug related, please open a blank issue located below. +title: "[Issue]: " + +body: +- type: markdown + attributes: + value: | + Thank you for taking the time to fill out this report! + + You can acquire your OS, CPU, GPU (for filling out this report) with the following commands: + + Linux: + echo "OS:" && cat /etc/os-release | grep -E "^(NAME=|VERSION=)"; + echo "CPU: " && cat /proc/cpuinfo | grep "model name" | sort --unique; + echo "GPU:" && /opt/rocm/bin/rocminfo | grep -E "^\s*(Name|Marketing Name)"; + + Windows: + (Get-WmiObject Win32_OperatingSystem).Version + (Get-WmiObject win32_Processor).Name + (Get-WmiObject win32_VideoController).Name +- type: textarea + attributes: + label: Problem Description + description: Describe the issue you encountered. + validations: + required: true +- type: input + attributes: + label: Operating System + description: What is the name and version number of the OS? + placeholder: "e.g. Ubuntu 22.04.3 LTS (Jammy Jellyfish)" + validations: + required: true +- type: input + attributes: + label: CPU + description: What CPU did you encounter the issue on? + placeholder: "e.g. AMD Ryzen 9 5900HX with Radeon Graphics" + validations: + required: true +- type: dropdown + attributes: + label: GPU + description: What GPU(s) did you encounter the issue on (you can select multiple GPUs from the list) + multiple: true + options: + - AMD Instinct MI300X + - AMD Instinct MI300A + - AMD Instinct MI300 + - AMD Instinct MI250X + - AMD Instinct MI250 + - AMD Instinct MI210 + - AMD Instinct MI100 + - AMD Instinct MI50 + - AMD Instinct MI25 + - AMD Radeon Pro V620 + - AMD Radeon Pro VII + - AMD Radeon RX 7900 XTX + - AMD Radeon VII + - AMD Radeon Pro W7900 + - AMD Radeon Pro W7800 + - AMD Radeon Pro W6800 + - AMD Radeon Pro W6600 + - AMD Radeon Pro W5500 + - AMD Radeon RX 7900 XT + - AMD Radeon RX 7600 + - AMD Radeon RX 6950 XT + - AMD Radeon RX 6900 XT + - AMD Radeon RX 6800 XT + - AMD Radeon RX 6800 + - AMD Radeon RX 6750 + - AMD Radeon RX 6700 XT + - AMD Radeon RX 6700 + - AMD Radeon RX 6650 XT + - AMD Radeon RX 6600 XT + - AMD Radeon RX 6600 + - Other + validations: + required: true +- type: input + attributes: + label: Other + description: If you selected Other, please specify +- type: dropdown + attributes: + label: ROCm Version + description: What version(s) of ROCm did you encounter the issue on? + multiple: true + options: + - ROCm 6.0.0 + - ROCm 5.7.1 + - ROCm 5.7.0 + - ROCm 5.6.1 + - ROCm 5.6.0 + - ROCm 5.5.1 + - ROCm 5.5.0 + validations: + required: true +- type: dropdown + attributes: + label: ROCm Component + description: (Optional) If this issue relates to a specific ROCm component, it can be mentioned here. + multiple: true + options: + - Other + - AMD Common Language Runtime + - AMD MIGraphX + - AMD System Management Interface + - amdgpu KCL/autoconf + - amdgpu Kernel-mode GPU Driver + - amdgpu-install + - AOMP + - AOMP Extras + - AqlProfile + - build-infra + - chelsio + - clang-ocl + - Composable Kernel + - dkms + - docker / ROCm-docker + - flang + - gpuburn + - half + - HIP + - HIP Examples + - hipBLAS + - hipBLASLt + - HIPCC + - hipCUB + - hip-examples-private + - hipFFT + - hipfort + - HIPIFY + - hipRAND + - hipSOLVER + - hipSPARSE + - hipSPARSELt + - hipTensor + - hip-tests + - HSA Runtime + - infrastructure + - jenkins-utils + - libdrm + - Linux BPI packaging framework + - llvm-project + - Mesa + - meta + - MIOpen + - MIVisionX + - ml-framework-ci + - MLSEQA_TestRepo + - OpenCL API C++ Bindings + - OpenCL API Headers + - OpenCL Conformance Test Suite + - OpenCL ICD Loader + - perftest-p2p + - prototype + - RCCL + - rccl-rdma-sharp-plugins + - rocALUTION + - rocBLAS + - ROCdbgapi + - ROCdebug-agent + - rocFFT + - ROCgdb + - ROCK + - ROCm Documentation/Website + - ROCm Data Center Tool + - ROCm Examples + - ROCm for Windows + - ROCm Performance Primitives + - ROCm System Management Interface Library + - ROCm Thrust + - ROCm Validation Suite + - rocm_bandwidth_test + - rocm-cmake + - rocm-core + - rocm-docs-core + - rocminfo + - rocMLIR + - rocmtools + - rocPRIM + - rocprofiler + - rocRAND + - ROCR-Runtime + - rocSOLVER + - rocSPARSE + - roctracer + - ROCT-Thunk-Interface + - rocWMMA + - Tensile + - umr + - ibv_rc_pingpong-amd + - mellanox + - mpitest + - Pytorch + - Tensorflow + - APEX + - torchvision + - Magma +- type: textarea + attributes: + label: Steps to Reproduce + description: (Optional) Detailed steps to reproduce the issue. + validations: + required: false + +- type: textarea + attributes: + label: (Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support + description: The output of rocminfo --support could help to better address the problem. + validations: + required: false + +- type: textarea + attributes: + label: Additional Information + description: (Optional) Any additional information that is relevant, e.g. relevant environment variables, dockerfiles, log files, dmesg output (on Linux), etc. + validations: + required: false diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 276690bd4f..0e0a252eb6 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -10,3 +10,9 @@ updates: open-pull-requests-limit: 10 schedule: interval: "daily" + labels: + - "documentation" + - "dependencies" + - "ci:docs-only" + reviewers: + - "samjwu" diff --git a/.gitignore b/.gitignore index 7af066c82d..340f11cbd2 100644 --- a/.gitignore +++ b/.gitignore @@ -54,5 +54,4 @@ _images/ _static/ _templates/ _toc.yml -docBin/ _doxygen/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 5f50df2525..9e6678abe5 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -3,11 +3,6 @@ version: 2 -build: - os: ubuntu-22.04 - tools: - python: "3.8" - sphinx: configuration: docs/conf.py @@ -16,3 +11,8 @@ formats: [htmlzip, pdf, epub] python: install: - requirements: docs/sphinx/requirements.txt + +build: + os: ubuntu-22.04 + tools: + python: "3.8" diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e46a4ab4b..2891b8585b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ None - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804) - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799) - Support for Batched Gemm DL (#732) +- Introduce wrapper sublibrary (limited functionality). (#1071, #1098) ### Changes - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817) diff --git a/CITATION.cff b/CITATION.cff index d35fe9e587..3813d63812 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -59,9 +59,9 @@ authors: family-names: Zhou - given-names: Jianfeng family-names: Yan -repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel' +repository-code: 'https://github.com/ROCm/composable_kernel' abstract: Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for Machine Learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel progarmming languages, like HIP C++. keywords: - 'CK, Composable Kernel, Tensor Coordinate Transformation' license: MIT -license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE +license-url: https://github.com/ROCm/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE diff --git a/CMakeLists.txt b/CMakeLists.txt index e780c15657..4e4b9d8d4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -146,6 +146,33 @@ if(${hip_VERSION_FLAT} GREATER 500723302) add_compile_options(-fno-offload-uniform-block) endif() +# +# Seperate linking jobs from compiling +# Too many concurrent linking jobs can break the build +# Copied from LLVM +set(CK_PARALLEL_LINK_JOBS "" CACHE STRING + "Define the maximum number of concurrent link jobs (Ninja only).") +if(CMAKE_GENERATOR MATCHES "Ninja") + if(CK_PARALLEL_LINK_JOBS) + set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${CK_PARALLEL_LINK_JOBS}) + set(CMAKE_JOB_POOL_LINK link_job_pool) + endif() +elseif(CK_PARALLEL_LINK_JOBS) + message(WARNING "Job pooling is only available with Ninja generators.") +endif() +# Similar for compiling +set(CK_PARALLEL_COMPILE_JOBS "" CACHE STRING + "Define the maximum number of concurrent compile jobs (Ninja only).") +if(CMAKE_GENERATOR MATCHES "Ninja") + if(CK_PARALLEL_COMPILE_JOBS) + set_property(GLOBAL APPEND PROPERTY JOB_POOLS compile_job_pool=${CK_PARALLEL_COMPILE_JOBS}) + set(CMAKE_JOB_POOL_COMPILE compile_job_pool) + endif() +elseif(CK_PARALLEL_COMPILE_JOBS) + message(WARNING "Job pooling is only available with Ninja generators.") +endif() + + option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF) option(USE_OPT_NAVI3X, "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF) diff --git a/Dockerfile b/Dockerfile index 87b4eb8e2b..b9339ec5d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:20.04 ARG DEBIAN_FRONTEND=noninteractive -ARG ROCMVERSION=5.7 +ARG ROCMVERSION=6.0 ARG compiler_version="" ARG compiler_commit="" @@ -16,8 +16,8 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg -RUN wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb --no-check-certificate -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_5.7.50700-1_all.deb +RUN wget https://repo.radeon.com/amdgpu-install/6.0/ubuntu/focal/amdgpu-install_6.0.60000-1_all.deb --no-check-certificate +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.0.60000-1_all.deb RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \ diff --git a/Jenkinsfile b/Jenkinsfile index 8e67f9cc39..2bb48b85ce 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -33,7 +33,7 @@ def runShell(String command){ def getDockerImageName(){ def img - if (params.ROCMVERSION != "6.0"){ + if (params.ROCMVERSION != "6.1"){ if (params.COMPILER_VERSION == "") { img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" } @@ -302,7 +302,7 @@ def buildHipClangJob(Map conf=[:]){ def retimage (retimage, image) = getDockerImage(conf) - gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { timeout(time: 5, unit: 'HOURS') { @@ -355,7 +355,7 @@ def runCKProfiler(Map conf=[:]){ def variant = env.STAGE_NAME def retimage - gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { try { (retimage, image) = getDockerImage(conf) withDockerContainer(image: image, args: dockerOpts) { @@ -487,7 +487,7 @@ def Build_CK(Map conf=[:]){ def retimage def navi_node = 0 - gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { try { (retimage, image) = getDockerImage(conf) withDockerContainer(image: image, args: dockerOpts) { @@ -553,7 +553,7 @@ def Build_CK(Map conf=[:]){ sh """#!/bin/bash rm -rf "${params.hipTensor_branch}".zip rm -rf hipTensor-"${params.hipTensor_branch}" - wget https://github.com/ROCmSoftwarePlatform/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip + wget https://github.com/ROCm/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip unzip -o "${params.hipTensor_branch}".zip """ dir("hipTensor-${params.hipTensor_branch}"){ @@ -605,7 +605,7 @@ def process_results(Map conf=[:]){ def variant = env.STAGE_NAME def retimage - gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { try { (retimage, image) = getDockerImage(conf) } @@ -655,8 +655,8 @@ def process_results(Map conf=[:]){ } //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION= - 0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT= +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.0;COMPILER_VERSION= + 0 21 * * * % ROCMVERSION=6.0;COMPILER_VERSION=;COMPILER_COMMIT= 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=;USE_SCCACHE=false 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;COMPILER_COMMIT=;USE_SCCACHE=false''' : "" @@ -675,8 +675,8 @@ pipeline { description: "Force building docker image (default: false), set to true if docker image needs to be updated.") string( name: 'ROCMVERSION', - defaultValue: '5.7', - description: 'Specify which ROCM version to use: 5.7 (default).') + defaultValue: '6.0', + description: 'Specify which ROCM version to use: 6.0 (default).') string( name: 'COMPILER_VERSION', defaultValue: '', @@ -703,8 +703,8 @@ pipeline { description: "Use the CK build to verify hipTensor build and tests (default: ON)") string( name: 'hipTensor_branch', - defaultValue: 'develop', - description: 'Specify which branch of hipTensor to use (default: develop)') + defaultValue: 'mainline', + description: 'Specify which branch of hipTensor to use (default: mainline)') booleanParam( name: "USE_SCCACHE", defaultValue: true, @@ -768,8 +768,15 @@ pipeline { } agent{ label rocmnode("gfx908 || gfx90a") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -DCMAKE_EXE_LINKER_FLAGS=" -L ${env.WORKSPACE}/script -T hip_fatbin_insert " """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \ + -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \ + -DCMAKE_EXE_LINKER_FLAGS=" -L ${env.WORKSPACE}/script -T hip_fatbin_insert " \ + -DCMAKE_CXX_FLAGS=" -O3 " """ + execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ + cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ + -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \ + -DCMAKE_CXX_COMPILER="${build_compiler()}" \ + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') @@ -784,8 +791,12 @@ pipeline { } agent{ label rocmnode("gfx908 || gfx90a") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """ + execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ + cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ + -DGPU_TARGETS="gfx908;gfx90a" \ + -DCMAKE_CXX_COMPILER="${build_compiler()}" \ + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') @@ -800,8 +811,12 @@ pipeline { } agent{ label rocmnode("navi21") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """ + execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ + cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ + -DGPU_TARGETS="gfx1030" \ + -DCMAKE_CXX_COMPILER="${build_compiler()}" \ + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') @@ -816,8 +831,12 @@ pipeline { } agent{ label rocmnode("navi32") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """ + execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ + cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ + -DGPU_TARGETS="gfx1101" \ + -DCMAKE_CXX_COMPILER="${build_compiler()}" \ + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') diff --git a/README.md b/README.md index e5a20f143f..4889914691 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,6 @@ python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page. -page. ```note If you use CK, cite us as follows: @@ -71,7 +70,7 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa 3. Clone CK source code from the GitHub repository and start the build: ```bash - git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git && \ + git clone https://github.com/ROCm/composable_kernel.git && \ cd composable_kernel && \ mkdir build && \ cd build diff --git a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc index c72c72971d..e8f5529520 100644 --- a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc +++ b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc @@ -16,6 +16,7 @@ using InLayout = ck::tensor_layout::convolution::NDHWGC; using WeiLayout = ck::tensor_layout::convolution::GKZYXC; using OutLayout = ck::tensor_layout::convolution::NDHWGK; +using BiasLayout = ck::tensor_layout::convolution::G_K; using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu; @@ -64,6 +65,9 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() std::array out_lengths{G, N, K, Do, Ho, Wo}; std::array out_strides{ K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; + // Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW) + std::array bias_lengths{G, 1, K, 1, 1, 1}; + std::array bias_strides{K, 0, 1, 0, 0, 0}; std::array filter_strides{1, 1, 1}; std::array filter_dilations{1, 1, 1}; @@ -74,13 +78,13 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C); SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K); SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K); - SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * N * Do * Ho * Wo * G * K); + SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * G * K); using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< NumDimSpatial, InLayout, WeiLayout, - ck::Tuple, + ck::Tuple, OutLayout, InDataType, WeiDataType, @@ -117,8 +121,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() in_strides, wei_lengths, wei_strides, - {out_lengths, out_lengths}, - {out_strides, out_strides}, + {out_lengths, bias_lengths}, + {out_strides, bias_strides}, out_lengths, out_strides, filter_strides, @@ -187,8 +191,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() in_strides, wei_lengths, wei_strides, - {out_lengths, out_lengths}, - {out_strides, out_strides}, + {out_lengths, bias_lengths}, + {out_strides, bias_strides}, out_lengths, out_strides, filter_strides, diff --git a/client_example/25_tensor_transforms/CMakeLists.txt b/client_example/25_tensor_transforms/CMakeLists.txt new file mode 100644 index 0000000000..d1543fb0ef --- /dev/null +++ b/client_example/25_tensor_transforms/CMakeLists.txt @@ -0,0 +1,4 @@ +add_executable(client_tensor_transform tensor_transform.cpp) +target_link_libraries(client_tensor_transform PRIVATE composable_kernel::device_other_operations) +add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp) +target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations) diff --git a/example/64_tensor_transforms/tensor_transform.cpp b/client_example/25_tensor_transforms/tensor_transform.cpp similarity index 100% rename from example/64_tensor_transforms/tensor_transform.cpp rename to client_example/25_tensor_transforms/tensor_transform.cpp diff --git a/example/64_tensor_transforms/tensor_transform_using_wrapper.cpp b/client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp similarity index 74% rename from example/64_tensor_transforms/tensor_transform_using_wrapper.cpp rename to client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp index df2449e99d..de9fcde0b4 100644 --- a/example/64_tensor_transforms/tensor_transform_using_wrapper.cpp +++ b/client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp @@ -9,7 +9,7 @@ #include "ck/utility/tuple.hpp" #include "ck/utility/sequence.hpp" -#include "tensor_transform_wrapper.hpp" +#include "ck/wrapper/layout.hpp" using DataType = int; @@ -17,7 +17,7 @@ template void Print1d(const Layout& layout) { std::cout << "Print1d" << std::endl; - for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size(layout); w++) + for(ck::index_t w = 0; w < ck::wrapper::size(layout); w++) { std::cout << layout(ck::make_tuple(w)) << " "; } @@ -28,9 +28,9 @@ template void Print2d(const Layout& layout) { std::cout << "Print2d" << std::endl; - for(ck::index_t h = 0; h < ck::tensor_transform_wrapper::size<0>(layout); h++) + for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++) { - for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++) + for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++) { std::cout << layout(ck::make_tuple(h, w)) << " "; } @@ -43,15 +43,11 @@ template void Print3dCustom(const Layout& layout) { std::cout << "Print3dCustom" << std::endl; - for(ck::index_t d = 0; - d < ck::tensor_transform_wrapper::size<0>(ck::tensor_transform_wrapper::get<0>(layout)); - d++) + for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++) { - for(ck::index_t h = 0; - h < ck::tensor_transform_wrapper::size<1>(ck::tensor_transform_wrapper::get<0>(layout)); - h++) + for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++) { - for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++) + for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++) { std::cout << layout(ck::make_tuple(ck::make_tuple(d, h), w)) << " "; } @@ -68,7 +64,7 @@ int main() // Basic descriptor 0, 1, 2, ... 30, 31 (compile-time descriptor) // (dims:4,8 strides:1,4) const auto shape_4x8 = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}); - const auto layout_4x8_s1x4 = ck::tensor_transform_wrapper::make_layout(shape_4x8); + const auto layout_4x8_s1x4 = ck::wrapper::make_layout(shape_4x8); std::cout << "dims:4,8 strides:1,4" << std::endl; Print2d(layout_4x8_s1x4); using Cord1x1Type = ck::Tuple, ck::Number<1>>; @@ -77,10 +73,9 @@ int main() // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (runtime descriptor) // dims:4,(2,4) strides:2,(1,8) - const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4)); - const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8)); - const auto layout_4x2x4_s2x1x8 = - ck::tensor_transform_wrapper::make_layout(shape_4x2x4, strides_s2x1x8); + const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4)); + const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8)); + const auto layout_4x2x4_s2x1x8 = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8); std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl; Print2d(layout_4x2x4_s2x1x8); @@ -92,7 +87,7 @@ int main() const auto strides_s1x4x2x8 = ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::make_tuple(ck::Number<2>{}, ck::Number<8>{})); static const auto layout_2x2x2x4_s1x4x2x8 = - ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8); + ck::wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8); std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl; Print2d(layout_2x2x2x4_s1x4x2x8); @@ -108,7 +103,7 @@ int main() ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::Number<2>{}), ck::Number<8>{}); static const auto layout_2x2x2x4_s1x4x2x8_nested = - ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested); + ck::wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested); std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl; Print1d(layout_2x2x2x4_s1x4x2x8_nested); diff --git a/dev-requirements.txt b/dev-requirements.txt index 9e7b9f01e1..d5d91f8c27 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,3 +1,3 @@ -ROCmSoftwarePlatform/rocm-recipes +ROCm/rocm-recipes RadeonOpenCompute/rocm-cmake@04f694df2a8dc9d7e35fa4dee4ba5fa407ec04f8 --build -danmar/cppcheck@2.9 \ No newline at end of file +danmar/cppcheck@2.9 diff --git a/docs/conf.py b/docs/conf.py index 0de590da1a..e441ff1ced 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -4,23 +4,34 @@ # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html -import subprocess +import re from rocm_docs import ROCmDocs +html_theme_options = {"flavor": "list"} -name = "Composable Kernel" -get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt' -version = subprocess.getoutput(get_version) -if len(version) > 0: - name = f"{name} {version}" +with open('../CMakeLists.txt', encoding='utf-8') as f: + match = re.search(r'.*set\(version ([0-9.]+)[^0-9.]+', f.read()) + if not match: + raise ValueError("VERSION not found!") + version_number = match[1] +left_nav_title = f"Composable Kernel {version_number} Documentation" + +# for PDF output on Read the Docs +project = "Composable Kernel Documentation" +author = "Advanced Micro Devices, Inc." +copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved." +version = version_number +release = version_number external_toc_path = "./sphinx/_toc.yml" -docs_core = ROCmDocs(f"{name} Documentation") -docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml") +docs_core = ROCmDocs(left_nav_title) +docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") docs_core.setup() +external_projects_current_project = "composable_kernel" + mathjax3_config = { 'tex': { 'macros': { diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 1084f94c81..fac9e138e1 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = docBin +OUTPUT_DIRECTORY = . # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -778,7 +778,9 @@ WARN_LOGFILE = INPUT = ../../include/ck/tensor_operation/gpu/grid \ ../../include/ck/tensor_operation/gpu/block \ ../../include/ck/tensor_operation/gpu/thread \ - ../../library/include/ck/library/utility + ../../library/include/ck/library/utility \ + ../../include/ck/wrapper + # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/docs/index.rst b/docs/index.rst index 51c0c862ae..8c4aaa2b3d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -34,6 +34,7 @@ Current CK library are structured into 4 layers: * "Templated Tile Operators" layer * "Templated Kernel and Invoker" layer * "Instantiated Kernel and Invoker" layer +* "Wrapper for tensor transform operations" * "Client API" layer .. image:: data/ck_layer.png @@ -50,6 +51,7 @@ The following is a list of CK documents in the suggested reading order: tutorial_hello_world dockerhub + wrapper Supported_Primitives_Guide API_Reference_Guide Contributors_Guide diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 83dd1e7b1a..c37ba29cec 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -5,6 +5,6 @@ defaults: maxdepth: 6 root: index subtrees: - - caption: About - entries: - - file: license +- caption: About + entries: + - file: license diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index c4ce8be79a..0a65ffc81a 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core>=0.20.0 +rocm-docs-core==0.30.1 sphinxcontrib-bibtex==2.6.1 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 5852315958..75863c214e 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -16,7 +16,7 @@ beautifulsoup4==4.11.2 # via pydata-sphinx-theme breathe==4.34.0 # via rocm-docs-core -certifi==2022.12.7 +certifi==2023.7.22 # via requests cffi==1.15.1 # via @@ -26,7 +26,7 @@ charset-normalizer==3.1.0 # via requests click==8.1.3 # via sphinx-external-toc -cryptography==40.0.2 +cryptography==41.0.6 # via pyjwt deprecated==1.2.13 # via pygithub @@ -42,7 +42,7 @@ fastjsonschema==2.18.0 # via rocm-docs-core gitdb==4.0.10 # via gitpython -gitpython==3.1.35 +gitpython==3.1.37 # via rocm-docs-core idna==3.4 # via requests @@ -88,9 +88,9 @@ pydata-sphinx-theme==0.13.3 # via # rocm-docs-core # sphinx-book-theme -pygithub==1.58.2 +pygithub==1.58.1 # via rocm-docs-core -pygments==2.14.0 +pygments==2.15.0 # via # accessible-pygments # pydata-sphinx-theme @@ -109,11 +109,11 @@ pyyaml==6.0 # pybtex # rocm-docs-core # sphinx-external-toc -requests==2.28.2 +requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==0.27.0 +rocm-docs-core==0.30.1 # via -r requirements.in six==1.16.0 # via @@ -141,7 +141,7 @@ sphinx-book-theme==1.0.1 # via rocm-docs-core sphinx-copybutton==0.5.1 # via rocm-docs-core -sphinx-design==0.3.0 +sphinx-design==0.4.1 # via rocm-docs-core sphinx-external-toc==0.3.1 # via rocm-docs-core @@ -163,7 +163,7 @@ sphinxcontrib-serializinghtml==1.1.5 # via sphinx typing-extensions==4.5.0 # via pydata-sphinx-theme -urllib3==1.26.15 +urllib3==1.26.18 # via requests wrapt==1.15.0 # via deprecated diff --git a/docs/wrapper.rst b/docs/wrapper.rst new file mode 100644 index 0000000000..a2f60b97ae --- /dev/null +++ b/docs/wrapper.rst @@ -0,0 +1,73 @@ +=============== +Wrapper +=============== + +------------------------------------- +Description +------------------------------------- + +.. note:: + + The wrapper is under development and its functionality is limited. + + +CK provides a lightweight wrapper for more complex operations implemented in +the library. It allows indexing of nested layouts using a simple interface +(avoiding complex descriptor transformations) and memory access (using Tensor). + +Example: + +.. code-block:: c + + const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4)); + const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8)); + const auto layout = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8); + + std::array data; + auto tensor = ck::wrapper::make_tensor(&data[0], layout); + + for(ck::index_t w = 0; w < size(tensor); w++) { + tensor(w) = w; + } + + // slice() == slice(0, -1) (whole dimension) + auto tensor_slice = tensor(ck::wrapper::slice(1, 3), ck::make_tuple(ck::wrapper::slice(), ck::wrapper::slice())); + std::cout << "dims:2,(2,4) strides:2,(1,8)" << std::endl; + for(ck::index_t h = 0; h < ck::wrapper::size<0>(tensor_slice); h++) + { + for(ck::index_t w = 0; w < ck::wrapper::size<1>(tensor_slice); w++) + { + std::cout << tensor_slice(h, w) << " "; + } + std::cout << std::endl; + } + +Output:: + + dims:2,(2,4) strides:2,(1,8) + 1 5 9 13 17 21 25 29 + 2 6 10 14 18 22 26 30 + +------------------------------------- +Layout +------------------------------------- + +.. doxygenstruct:: ck::wrapper::Layout + +------------------------------------- +Layout helpers +------------------------------------- + +.. doxygenfile:: layout_utils.hpp + +------------------------------------- +Tensor +------------------------------------- + +.. doxygenstruct:: ck::wrapper::Tensor + +------------------------------------- +Tensor helpers +------------------------------------- + +.. doxygenfile:: tensor_utils.hpp diff --git a/example/62_conv_fwd_activ/CMakeLists.txt b/example/62_conv_fwd_activ/CMakeLists.txt index bb95602416..d1f26bbfe1 100644 --- a/example/62_conv_fwd_activ/CMakeLists.txt +++ b/example/62_conv_fwd_activ/CMakeLists.txt @@ -42,6 +42,8 @@ foreach(gpu IN LISTS GPU_TARGETS) # ScaleAdd ScaleAdd Relu add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp) add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16) + add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp) + add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16) set(target 1) endif() endforeach() diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp b/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp new file mode 100644 index 0000000000..196636f8b5 --- /dev/null +++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +constexpr ck::index_t NDimSpatial = 3; +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using AccDataType = float; +using CShuffleDataType = ck::half_t; +using OutDataType = ck::half_t; + +template +using S = ck::Sequence; + +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; + +using BiasLayout = ck::tensor_layout::convolution::G_K; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; + +using OutElementOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 32, // KPerBlock + 8, // AK1 + 8, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 32, 1, 8>, + 8>; + +using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance; + +namespace { +// Use custom implementation to pass two more tensors for post op +template +bool run_grouped_conv_fwd(bool do_verification, + int init_method, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const InElementOp& in_element_op, + const WeiElementOp& wei_element_op, + const OutElementOp& out_element_op) +{ + constexpr ck::index_t NumDs = 2; + const ck::index_t G = out_g_n_k_wos_desc.GetLengths()[0]; + const ck::index_t K = out_g_n_k_wos_desc.GetLengths()[2]; + + // Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW) + std::array bias_g_k_lengths; + std::array bias_g_k_strides; + // Fill other lenghts than G,K with 1 and strides with 0 + bias_g_k_lengths.fill(1); + bias_g_k_strides.fill(0); + bias_g_k_lengths[0] = G; + bias_g_k_lengths[2] = K; + bias_g_k_strides[0] = K; // stride to G + bias_g_k_strides[2] = 1; // stride to K + const auto broadcasted_bias_desc = HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides); + + // y = relu ( alpha1 * conv(x) + alpha2 * z + bias ) + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + std::array, NumDs> d_tensors = {Tensor(out_g_n_k_wos_desc), + Tensor(broadcasted_bias_desc)}; + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + std::cout << "z_tensor: " << d_tensors[0].mDesc << std::endl; + std::cout << "bias_tensor: " << d_tensors[1].mDesc << std::endl; + + // Make sure that we allocated only G * K values for bias + assert(static_cast(d_tensors[1].mData.size()) == G * K); + + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + wei.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + d_tensors[0].GenerateTensorValue(GeneratorTensor_2{-2, 2}); + d_tensors[1].GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-1.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.05, 0.05}); + d_tensors[0].GenerateTensorValue(GeneratorTensor_3{-0.05, 0.05}); + d_tensors[1].GenerateTensorValue(GeneratorTensor_3{-0.05, 0.05}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem z_buf(sizeof(OutDataType) * d_tensors[0].mDesc.GetElementSpaceSize()); + DeviceMem bias_buf(sizeof(OutDataType) * d_tensors[1].mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + z_buf.ToDevice(d_tensors[0].mData.data()); + bias_buf.ToDevice(d_tensors[1].mData.data()); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + const std::array ds = {z_buf.GetDeviceBuffer(), bias_buf.GetDeviceBuffer()}; + + auto conv = DeviceConvNDFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + ds, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, NumDs>{ + e_g_n_k_wos_lengths, bias_g_k_lengths}, + std::array, NumDs>{ + e_g_n_k_wos_strides, bias_g_k_strides}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error("The device op with the specified compilation parameters does " + "not support this convolution problem."); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops() + G * K + + conv_param.GetOutputByte() / sizeof(OutDataType); + std::size_t num_btype = conv_param.GetByte() + + G * K * sizeof(OutDataType) + conv_param.GetOutputByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + if(do_verification) + { + auto ref_conv = + ck::tensor_operation::host::ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + out_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op, + {}, + {}, + d_tensors); + + ref_invoker.Run(ref_argument); + + out_device_buf.FromDevice(out_device.mData.data()); + + return ck::utils::check_err(out_device, out_host, "Error: incorrect results!"); + } + + return true; +} + +} // namespace + +#include "run_convnd_fwd_activ_example.inc" + +int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); } diff --git a/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc b/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc index 7c20c01066..aa547c870a 100644 --- a/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc +++ b/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc @@ -24,7 +24,7 @@ bool run_convnd_fwd_example(int argc, char* argv[]) // Following shapes are selected to avoid overflow. Expect inf in case of // size increase for some elementwise ops. ck::utils::conv::ConvParam conv_param{ - 3, 1, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}; + 3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}; if(argc == 1) { diff --git a/example/64_tensor_transforms/CMakeLists.txt b/example/64_tensor_transforms/CMakeLists.txt deleted file mode 100644 index 9d14a410e3..0000000000 --- a/example/64_tensor_transforms/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_example_executable(example_tensor_transform tensor_transform.cpp) -add_example_executable(example_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp) diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp index be2c2395fc..e8dabc9973 100644 --- a/include/ck/host_utility/device_prop.hpp +++ b/include/ck/host_utility/device_prop.hpp @@ -26,7 +26,7 @@ inline std::string get_device_name() } const std::string raw_name(props.gcnArchName); - // https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40 + // https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40 static std::map device_name_map = { {"Ellesmere", "gfx803"}, {"Baffin", "gfx803"}, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp index 29d7a2b949..0c8e11a17b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp @@ -14,6 +14,7 @@ #include "ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -500,22 +501,29 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle // for sanity check of vector memory access for(index_t i = 0; i < NumATensor; ++i) { - a_mz_stride_[i] = a_ms_ks_strides[i][NumDimM - 1]; - a_kz_stride_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1]; + as_mz_consecutive_[i] = a_ms_ks_strides[i][NumDimM - 1] == 1; + as_kz_consecutive_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1] == 1; + as_max_read_elems_[i] = + CalculateMaxRead(a_ms_ks_lengths[i], a_ms_ks_strides[i]); } for(index_t i = 0; i < NumBTensor; ++i) { - b_nz_stride_[i] = b_ns_ks_strides[i][NumDimN - 1]; - b_kz_stride_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1]; + bs_nz_consecutive_[i] = b_ns_ks_strides[i][NumDimN - 1] == 1; + bs_kz_consecutive_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1] == 1; + bs_max_read_elems_[i] = + CalculateMaxRead(b_ns_ks_lengths[i], b_ns_ks_strides[i]); } for(index_t i = 0; i < NumDTensor; ++i) { - ds_nz_stride_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1]; + ds_nz_consecutive_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1; + ds_max_read_elems_[i] = + CalculateMaxRead(d_ms_ns_lengths[i], d_ms_ns_strides[i]); } - e_nz_stride_ = e_ms_ns_stride[NumDimM + NumDimN - 1]; + e_nz_consecutive_ = e_ms_ns_stride[NumDimM + NumDimN - 1] == 1; + e_max_write_elems_ = CalculateMaxRead(e_ms_ns_length, e_ms_ns_stride); } // pointers @@ -545,16 +553,19 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle BElementwiseOperation b_element_op_; CDEElementwiseOperation cde_element_op_; - // Strides for the last M/N/K dimensions of A/B/Ds/E - // for sanity check of vector load/store - std::array a_mz_stride_; - std::array a_kz_stride_; + // Describe whether the last part of a given dimension of A/B/D/E is consecutive + // in the memory or not. + std::array as_mz_consecutive_; + std::array as_kz_consecutive_; + std::array bs_nz_consecutive_; + std::array bs_kz_consecutive_; + std::array ds_nz_consecutive_; + bool e_nz_consecutive_; - std::array b_nz_stride_; - std::array b_kz_stride_; - - std::array ds_nz_stride_; - index_t e_nz_stride_; + std::array as_max_read_elems_; + std::array bs_max_read_elems_; + std::array ds_max_read_elems_; + index_t e_max_write_elems_; }; // Invoker @@ -643,73 +654,65 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle // check vector load/store { - bool all_valid = true; - + bool valid_as_access = true; static_for<0, NumATensor, 1>{}([&](auto i) { - // vector memory access of A: could be on M or AK1 dimension - if constexpr(ABlockTransferSrcVectorDim == 1) + const bool valid_a_vector_size = + arg.as_max_read_elems_[i] % ABlockTransferSrcScalarPerVector == 0; + const bool valid_a_access_dim_m = + ABlockTransferSrcVectorDim == 1 && arg.as_mz_consecutive_[i]; + const bool valid_a_access_dim_k = + ABlockTransferSrcVectorDim == 2 && arg.as_kz_consecutive_[i]; + const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k; + if(!(valid_a_vector_size && valid_a_access_dim)) { - if(!(arg.a_mz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I1) % - ABlockTransferSrcScalarPerVector == - 0)) - { - all_valid = false; - } - } - else - { - if(!(arg.a_kz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I2) % - ABlockTransferSrcScalarPerVector == - 0)) - { - all_valid = false; - } + valid_as_access = false; } }); - - // vector memory access of B: could be on N or BK1 dimension - static_for<0, NumBTensor, 1>{}([&](auto i) { - if constexpr(BBlockTransferSrcVectorDim == 1) - { - if(!(arg.b_nz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I1) % - BBlockTransferSrcScalarPerVector == - 0)) - { - all_valid = false; - } - } - else - { - if(!(arg.b_kz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I2) % - BBlockTransferSrcScalarPerVector == - 0)) - { - all_valid = false; - } - } - }); - - // check vector load of Ds - static_for<0, NumDTensor, 1>{}([&](auto i) { - if(!(arg.ds_nz_stride_[i] == 1 && - arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) % - CDEBlockTransferScalarPerVector_NPerBlock == - 0)) - { - all_valid = false; - } - }); - - // vector memory access of E: always on NPerBlock dimension - if(!(arg.e_nz_stride_ == 1 && - arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) % - CDEBlockTransferScalarPerVector_NPerBlock == - 0)) + if(!valid_as_access) { - all_valid = false; + return false; } - if(!all_valid) + bool valid_bs_access = true; + static_for<0, NumBTensor, 1>{}([&](auto i) { + const bool valid_b_vector_size = + arg.bs_max_read_elems_[i] % BBlockTransferSrcScalarPerVector == 0; + const bool valid_b_access_dim_n = + BBlockTransferSrcVectorDim == 1 && arg.bs_nz_consecutive_[i]; + const bool valid_b_access_dim_k = + BBlockTransferSrcVectorDim == 2 && arg.bs_kz_consecutive_[i]; + const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k; + if(!(valid_b_vector_size && valid_b_access_dim)) + { + valid_bs_access = false; + } + }); + if(!valid_bs_access) + { + return false; + } + + bool valid_ds_access = true; + static_for<0, NumDTensor, 1>{}([&](auto i) { + const bool valid_d_vector_size = + arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0; + // Vector read of Ds is always on N dimension. + const bool valid_d_access_dim = arg.ds_nz_consecutive_[i]; + if(!(valid_d_vector_size && valid_d_access_dim)) + { + valid_ds_access = false; + } + }); + if(!valid_ds_access) + { + return false; + } + + const bool valid_e_vector_size = + arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0; + // Vector write of E is always on N dimension. + const bool valid_e_access_dim = arg.e_nz_consecutive_; + if(!(valid_e_vector_size && valid_e_access_dim)) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp index 71ff2ba17d..290abe221a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp @@ -13,6 +13,7 @@ #include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -183,7 +184,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle return generate_tuple([&](auto i) { return vec[i]; }, num); }; - const auto a_ms_ns_lengths = to_tuple(a_ms_ks_lengths_vec, Number{}); + const auto a_ms_ks_lengths = to_tuple(a_ms_ks_lengths_vec, Number{}); const auto a_ms_ks_strides = to_tuple(a_ms_ks_strides_vec, Number{}); // dimension Ids for M0, M1, ... @@ -194,14 +195,14 @@ struct DeviceContractionMultipleD_Xdl_CShuffle typename arithmetic_sequence_gen::type{}; // lengths for M0, M1, ... - const auto mLengths = get_container_subset(a_ms_ns_lengths, mDimIds); + const auto mLengths = get_container_subset(a_ms_ks_lengths, mDimIds); // lengths for K0, K1, ... - const auto kLengths = get_container_subset(a_ms_ns_lengths, kDimIds); + const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds); // naive tensor A[M0, M1, M2, ..., K0, K1, K2...] const auto a_grid_desc_ms_ks = - make_naive_tensor_descriptor(a_ms_ns_lengths, a_ms_ks_strides); + make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides); // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...] const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor( @@ -383,7 +384,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle const void* p_b_grid, std::array p_ds_grid, void* p_e_grid, - const std::vector& a_ms_ns_lengths, + const std::vector& a_ms_ks_lengths, const std::vector& a_ms_ks_strides, const std::vector& b_ns_ks_lengths, const std::vector& b_ns_ks_strides, @@ -398,7 +399,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle p_b_grid_{static_cast(p_b_grid)}, p_ds_grid_{}, p_e_grid_{static_cast(p_e_grid)}, - a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ns_lengths, a_ms_ks_strides)}, + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ks_lengths, a_ms_ks_strides)}, b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_ns_ks_lengths, b_ns_ks_strides)}, ds_grid_desc_m_n_{}, e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_ms_ns_lengths, e_ms_ns_strides)}, @@ -411,13 +412,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, a_element_op_{a_element_op}, b_element_op_{b_element_op}, - cde_element_op_{cde_element_op}, - a_mz_stride_{}, - a_kz_stride_{}, - b_nz_stride_{}, - b_kz_stride_{}, - ds_nz_stride_{}, - e_nz_stride_{} + cde_element_op_{cde_element_op} { // populate pointer, batch stride, desc for Ds static_for<0, NumDTensor, 1>{}([&](auto i) { @@ -448,18 +443,26 @@ struct DeviceContractionMultipleD_Xdl_CShuffle } // for sanity check of vector memory access - a_mz_stride_ = a_ms_ks_strides[NumDimM - 1]; - a_kz_stride_ = a_ms_ks_strides[NumDimM + NumDimK - 1]; + a_mz_consecutive_ = a_ms_ks_strides[NumDimM - 1] == 1; + a_kz_consecutive_ = a_ms_ks_strides[NumDimM + NumDimK - 1] == 1; + a_max_read_elems_ = + CalculateMaxRead(a_ms_ks_lengths, a_ms_ks_strides); - b_nz_stride_ = b_ns_ks_strides[NumDimN - 1]; - b_kz_stride_ = b_ns_ks_strides[NumDimN + NumDimK - 1]; + b_nz_consecutive_ = b_ns_ks_strides[NumDimN - 1] == 1; + b_kz_consecutive_ = b_ns_ks_strides[NumDimN + NumDimK - 1] == 1; + b_max_read_elems_ = + CalculateMaxRead(b_ns_ks_lengths, b_ns_ks_strides); for(index_t i = 0; i < NumDTensor; ++i) { - ds_nz_stride_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1]; + ds_nz_consecutive_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1; + ds_max_read_elems_[i] = + CalculateMaxRead(ds_ms_ns_lengths[i], ds_ms_ns_strides[i]); } - e_nz_stride_ = e_ms_ns_strides[NumDimM + NumDimN - 1]; + e_nz_consecutive_ = e_ms_ns_strides[NumDimM + NumDimN - 1] == 1; + e_max_write_elems_ = + CalculateMaxRead(e_ms_ns_lengths, e_ms_ns_strides); } void Print() const @@ -499,15 +502,19 @@ struct DeviceContractionMultipleD_Xdl_CShuffle BElementwiseOperation b_element_op_; CDEElementwiseOperation cde_element_op_; - // Strides for the last M/N/K dimensions of A/B/Ds/E - // for sanity check of vector load/store - index_t a_mz_stride_; - index_t a_kz_stride_; - index_t b_nz_stride_; - index_t b_kz_stride_; - std::array ds_nz_stride_; - index_t e_mz_stride_; - index_t e_nz_stride_; + // Describe whether the last part of a given dimension of A/B/D/E is consecutive + // in the memory or not. + bool a_mz_consecutive_; + bool a_kz_consecutive_; + bool b_nz_consecutive_; + bool b_kz_consecutive_; + std::array ds_nz_consecutive_; + bool e_nz_consecutive_; + + index_t a_max_read_elems_; + index_t b_max_read_elems_; + std::array ds_max_read_elems_; + index_t e_max_write_elems_; }; // Invoker @@ -616,65 +623,47 @@ struct DeviceContractionMultipleD_Xdl_CShuffle (BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2), "wrong!"); - // vector memory access of A: could be on M or AK1 dimension - if constexpr(ABlockTransferSrcVectorDim == 1) - { - if(!(arg.a_mz_stride_ == 1 && - arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0)) - { - return false; - } - } - else - { - if(!(arg.a_kz_stride_ == 1 && - arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0)) - { - return false; - } - } - - // vector memory access of B: could be on N or BK1 dimension - if constexpr(BBlockTransferSrcVectorDim == 1) - { - if(!(arg.b_nz_stride_ == 1 && - arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0)) - { - return false; - } - } - else - { - if(!(arg.b_kz_stride_ == 1 && - arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0)) - { - return false; - } - } - - // vector memory access of Ds: always on NPerBlock dimension - bool valid_d_access = true; - - static_for<0, NumDTensor, 1>{}([&](auto i) { - if(!(arg.ds_nz_stride_[i] == 1 && - arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) % - CDEBlockTransferScalarPerVector_NPerBlock == - 0)) - { - valid_d_access = false; - } - }); - - if(valid_d_access == false) + const bool valid_a_vector_size = + arg.a_max_read_elems_ % ABlockTransferSrcScalarPerVector == 0; + const bool valid_a_access_dim_m = ABlockTransferSrcVectorDim == 1 && arg.a_mz_consecutive_; + const bool valid_a_access_dim_k = ABlockTransferSrcVectorDim == 2 && arg.a_kz_consecutive_; + const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k; + if(!(valid_a_vector_size && valid_a_access_dim)) { return false; } - // vector memory access of E: always on NPerBlock dimension - if(!(arg.e_nz_stride_ == 1 && - arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) % - CDEBlockTransferScalarPerVector_NPerBlock == - 0)) + const bool valid_b_vector_size = + arg.b_max_read_elems_ % BBlockTransferSrcScalarPerVector == 0; + const bool valid_b_access_dim_n = BBlockTransferSrcVectorDim == 1 && arg.b_nz_consecutive_; + const bool valid_b_access_dim_k = BBlockTransferSrcVectorDim == 2 && arg.b_kz_consecutive_; + const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k; + if(!(valid_b_vector_size && valid_b_access_dim)) + { + return false; + } + + bool valid_ds_access = true; + static_for<0, NumDTensor, 1>{}([&](auto i) { + const bool valid_d_vector_size = + arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0; + // Vector read of Ds is always on N dimension. + const bool valid_d_access_dim = arg.ds_nz_consecutive_[i]; + if(!(valid_d_vector_size && valid_d_access_dim)) + { + valid_ds_access = false; + } + }); + if(!valid_ds_access) + { + return false; + } + + const bool valid_e_vector_size = + arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0; + // Vector write of E is always on N dimension. + const bool valid_e_access_dim = arg.e_nz_consecutive_; + if(!(valid_e_vector_size && valid_e_access_dim)) { return false; } @@ -692,7 +681,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle const void* p_b, std::array p_ds, void* p_e, - const std::vector& a_ms_ns_lengths, + const std::vector& a_ms_ks_lengths, const std::vector& a_ms_ks_strides, const std::vector& b_ns_ks_lengths, const std::vector& b_ns_ks_strides, @@ -708,7 +697,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle p_b, p_ds, p_e, - a_ms_ns_lengths, + a_ms_ks_lengths, a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, @@ -729,7 +718,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle const void* p_b, std::array p_ds, void* p_e, - const std::vector& a_ms_ns_lengths, + const std::vector& a_ms_ks_lengths, const std::vector& a_ms_ks_strides, const std::vector& b_ns_ks_lengths, const std::vector& b_ns_ks_strides, @@ -745,7 +734,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle p_b, p_ds, p_e, - a_ms_ns_lengths, + a_ms_ks_lengths, a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp new file mode 100644 index 0000000000..0e14b40942 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +/** + * Calculates the maximum number of subsequent elements of the fast changing dimension + * that are consecutive in memory. + * + * Example: + * NumDimM = 2, NumDimK = 3 + * A shape = [ 2, 3, 4, 5, 6] + * A strides = [360, 120, 30, 6, 1] + * | M | | K | + * It follows from strides that K is FCD and all the subsequent elements of K are consecutive + * in memory. + * But if strides were [360, 120, 6, 24, 1], then only 6 subsequent elements of K would be + * consecutive in memory. + * + * Assumes that the dimensions are split into two groups of `NumDim1` and `NumDim2` dimensions. + */ +template +auto CalculateMaxRead(const std::vector& lengths, const std::vector& strides) +{ + if(lengths.size() != NumDim1 + NumDim2) + { + std::ostringstream err; + err << "Incorrect number of lengths in " << __FILE__ << ":" << __LINE__ + << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } + if(strides.size() != NumDim1 + NumDim2) + { + std::ostringstream err; + err << "Incorrect number of strides in " << __FILE__ << ":" << __LINE__ + << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } + + // Determine the beginning and end idx of the group representing the FCD. + index_t begin_idx, end_idx; + if(strides[NumDim1 - 1] == 1) + { + begin_idx = 0; + end_idx = NumDim1 - 1; + } + else if(strides[NumDim1 + NumDim2 - 1] == 1) + { + begin_idx = NumDim1; + end_idx = NumDim1 + NumDim2 - 1; + } + else + { + // The dimension consecutive in memory is not the last dimension of any group, so only + // one element can be read/written at once. + return 1; + } + + index_t consecutive_stride = 1; + for(index_t dim_idx = end_idx; dim_idx >= begin_idx; --dim_idx) + { + if(strides[dim_idx] == consecutive_stride) + { + consecutive_stride *= lengths[dim_idx]; + } + else + { + break; + } + } + const index_t max_subsequent_elems = consecutive_stride; + return max_subsequent_elems; +} + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 26224b5dec..4afef85d8c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -357,15 +357,17 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle return out_gemmm_gemmn_desc; } + // Shape of Ds and E must be aligned. Strides can be different. + // Pass e_g_n_k_wos_lengths for logical broadcast. static auto MakeDsGridDescriptor_M_N( - const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_lengths, const std::array, NumDTensor>& ds_g_n_k_wos_strides) { return generate_tuple( [&](auto i) { using DLayout = remove_cvref_t>; - return DeviceOp::MakeEGridDescriptor_M_N(ds_g_n_k_wos_lengths[i], + return DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, ds_g_n_k_wos_strides[i]); }, Number{}); @@ -569,7 +571,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle // D desc ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N( - ds_g_n_k_wos_lengths[i], ds_g_n_k_wos_strides[i]); + e_g_n_k_wos_lengths, ds_g_n_k_wos_strides[i]); }); compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0]; @@ -916,8 +918,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle is_same_v || is_same_v || is_same_v || is_same_v || is_same_v || is_same_v || - is_same_v || is_same_v || - is_same_v) + is_same_v || is_same_v) { const index_t K = arg.ds_g_n_k_wos_lengths_[i][2]; @@ -925,6 +926,27 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle { valid = false; } + + if constexpr(is_same_v) + { + // G and K must be the same + if(arg.ds_g_n_k_wos_lengths_[i][0] != arg.e_g_n_k_wos_lengths_[0] || + arg.ds_g_n_k_wos_lengths_[i][2] != arg.e_g_n_k_wos_lengths_[2]) + { + valid = false; + } + } + else + { + // E and D must have the same shape + for(index_t d = 0; d < NDimSpatial + 3; d++) + { + if(arg.ds_g_n_k_wos_lengths_[i][d] != arg.e_g_n_k_wos_lengths_[d]) + { + valid = false; + } + } + } } else { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index 80a5d0e97a..0050a5b281 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -631,8 +631,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle is_same_v || is_same_v || is_same_v || is_same_v || is_same_v || is_same_v || - is_same_v || is_same_v || - is_same_v) + is_same_v || is_same_v) { const index_t K = arg.ds_g_n_k_wos_lengths_[i][2]; diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp index b2d141fd61..ecc71ba2f2 100644 --- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp +++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp @@ -308,12 +308,6 @@ struct GNDHWK : public BaseTensorLayout static constexpr const char* name = "GNDHWK"; }; -// for output bias -struct GK : public BaseTensorLayout -{ - static constexpr const char* name = "GK"; -}; - // output tensor // packed NWGK/NHWGK/NDHWGK struct NWGK : public BaseTensorLayout diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp index f77ffff350..9535ca69a9 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp @@ -50,7 +50,9 @@ __global__ void ignore = p_in_global; ignore = out_grid_desc; ignore = p_out_global; + ignore = batch_count; ignore = block_2_tile_map; + ignore = compute_ptr_offset_of_batch; #endif } diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp index 6f546f1d6d..e2f75142d4 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp @@ -522,22 +522,21 @@ struct TransformConvFwdToGemm // for output bias template || - is_same_v, + typename std::enable_if, bool>::type = false> - static auto - MakeCDescriptor_M_N(const std::array& c_g_n_k_wos_lengths, - const std::array& /* c_g_n_k_wos_strides */) + static auto MakeCDescriptor_M_N(const std::array& c_g_n_k_wos_lengths, + const std::array& c_g_n_k_wos_strides) { - const index_t N = c_g_n_k_wos_lengths[1]; - const index_t K = c_g_n_k_wos_lengths[2]; + const index_t N = c_g_n_k_wos_lengths[1]; + const index_t K = c_g_n_k_wos_lengths[2]; + const index_t KStride = c_g_n_k_wos_strides[2]; const index_t NHoWo = N * ck::accumulate_n( c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); const auto out_gemmm_gemmn_desc = - make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, I1)); + make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, KStride)); return out_gemmm_gemmn_desc; } diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp index d7b492fe66..75f2693f20 100644 --- a/include/ck/utility/tuple_helper.hpp +++ b/include/ck/utility/tuple_helper.hpp @@ -166,4 +166,16 @@ __host__ __device__ constexpr auto IsNestedTuple(const Tuple&) return (is_detected::value || ...); } +template +__host__ __device__ constexpr auto TupleDepth(const T&) +{ + return depth; +} + +template +__host__ __device__ constexpr auto TupleDepth(const Tuple&) +{ + return math::max(TupleDepth(Ts{})...); +} + } // namespace ck diff --git a/example/64_tensor_transforms/tensor_transform_wrapper.hpp b/include/ck/wrapper/layout.hpp similarity index 58% rename from example/64_tensor_transforms/tensor_transform_wrapper.hpp rename to include/ck/wrapper/layout.hpp index 71cd6091f8..f20d985b49 100644 --- a/example/64_tensor_transforms/tensor_transform_wrapper.hpp +++ b/include/ck/wrapper/layout.hpp @@ -3,27 +3,13 @@ #pragma once -#include "ck/ck.hpp" - -#include "ck/utility/number.hpp" -#include "ck/utility/tuple.hpp" -#include "ck/utility/tuple_helper.hpp" -#include "ck/utility/sequence.hpp" -#include "ck/utility/sequence_helper.hpp" -#include "ck/utility/is_detected.hpp" - -#include "ck/tensor_description/tensor_descriptor.hpp" -#include "ck/tensor_description/tensor_descriptor_helper.hpp" -#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/wrapper/utils/layout_utils.hpp" namespace ck { -namespace tensor_transform_wrapper { +namespace wrapper { /** - * \brief Layout wrapper - * - * \details - * Layout wrapper that performs the tensor descriptor logic. + * \brief Layout wrapper that performs the tensor descriptor logic. * * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t * (dynamic layout). It is possible to pass nested shapes @@ -32,21 +18,39 @@ namespace tensor_transform_wrapper { * (dynamic layout). Stride tuple should be nested if shape tuple is * nested. */ -template > +template struct Layout { private: static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; - template - using is_tuple = decltype(std::declval().IsTuple()); + // Generate default idxs tuple (idx with all merged nested shapes) + template + __host__ __device__ constexpr static auto GenerateDefaultIdxsTuple(const Tuple&) + { + return generate_tuple( + [&](auto) { + if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) + { + // runtime layout + return index_t(0); + } + else + { + // compiletime layout + return I0; + } + }, + Number::Size()>{}); + } // Generate packed (column-major) strides if not passed template __host__ __device__ constexpr static auto - GenerateColumnMajorPackedStrides(const Tuple& tuple) + GenerateColumnMajorPackedStrides(const Tuple& shape) { + const auto unrolled_shape = UnrollNestedTuple(shape); return generate_tuple( [&](auto i) { if constexpr(i.value == 0) @@ -56,10 +60,10 @@ struct Layout else { return TupleReduce([](auto x, auto y) { return x * y; }, - tuple); + unrolled_shape); } }, - Number::Size()>{}); + Number{}); } // Generate LowerDims in Compile-time for MergeTrasform using passed Type @@ -112,8 +116,8 @@ struct Layout // Example shape: (2, (2, 2)), 2, (2, 2) // Unrolled shape: 2, (2, 2), 2, (2, 2) template - __host__ __device__ constexpr static auto UnrollShapeViaIdx(const Tuple& shape, - const Tuple& idx) + __host__ __device__ constexpr static auto AlignShapeToIdx(const Tuple& shape, + const Tuple& idx) { if constexpr(!IsNestedTuple(Tuple{})) { @@ -125,7 +129,7 @@ struct Layout // Iterate over shape tuple elements: // 1. If corresponding idx element is tuple then return (will be unrolled) // 2. If no, pack in tuple. It will be restored during unroll. - auto unrolled_shape_via_idx = generate_tuple( + auto aligned_shape = generate_tuple( [&](auto i) { if constexpr(is_detected>>::value) @@ -140,37 +144,34 @@ struct Layout Number::Size()>{}); // Unroll and process next step - return UnrollShapeViaIdx(UnrollNestedTuple<0, 1>(unrolled_shape_via_idx), - UnrollNestedTuple<0, 1>(idx)); + return AlignShapeToIdx(UnrollNestedTuple<0, 1>(aligned_shape), + UnrollNestedTuple<0, 1>(idx)); } } template __host__ __device__ constexpr static auto MakeMerge1d(const Tuple& shape, - DescriptorToMerge& desc) + const DescriptorToMerge& desc) { // Reverse each element in tuple - using ReversedUnrolledShape = decltype(TupleReverse(UnrollNestedTuple(shape))); - const auto merge_elems = ReversedUnrolledShape{}; - + const auto merge_elems = TupleReverse(UnrollNestedTuple(shape)); // Generate reverted indexes (column major traverse) - using MergeElemsSequence = - typename arithmetic_sequence_gen<0, ReversedUnrolledShape::Size(), 1>::type; - const auto lower_dims = make_tuple(MergeElemsSequence::Reverse()); - const auto upper_dims = make_tuple(Sequence<0>{}); + using MergeElemsSequence = typename arithmetic_sequence_gen<0, merge_elems.Size(), 1>::type; + const auto lower_dims = make_tuple(MergeElemsSequence::Reverse()); + const auto upper_dims = make_tuple(Sequence<0>{}); // Merge to 1d return transform_tensor_descriptor( desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims); } - // Merge nested shape dims + // Merge nested shape dims when corresponding index is also nested. // Input desc shape: 2, 2, 2, 2, 2, 2 // Example idx: 1, 1, 1, 1 // Example shape: 2, (2, 2), 2, (2, 2) // Merged shape: 2, 4, 2, 4 template - __host__ __device__ constexpr static auto - MakeMerges(const Tuple& shape, const Tuple&, DescriptorToMerge& desc) + __host__ __device__ constexpr static auto CreateMergedDescriptor( + const Tuple& shape, const Tuple&, DescriptorToMerge& desc) { const auto transforms = generate_tuple( [&](auto i) { @@ -206,14 +207,38 @@ struct Layout return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims); } + template + __host__ __device__ static auto MakeFlattenDescriptor(const LayoutShape& shape, + const LayoutStrides& strides) + { + const auto unrolled_shape = UnrollNestedTuple(shape); + const auto unrolled_strides = UnrollNestedTuple(strides); + static_assert(unrolled_shape.Size() == unrolled_strides.Size(), + "Size of strides and shape are not consistent."); + return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); + } + + // If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`. + using DeducedStrides = + std::conditional_t>, + remove_cvref_t, + Strides>; + using FlattenDescriptorType = + remove_cvref_t; + using Descriptor1dType = + remove_cvref_t; + using DefaultIdxsTupleType = remove_cvref_t; + template - __host__ __device__ constexpr auto TransformDesc(const Tuple& shape, - const Tuple& idx) const + __host__ __device__ constexpr static auto + TransformDesc(const Tuple& shape, + const Tuple& idx, + const FlattenDescriptorType& naive_descriptor) { if constexpr(Tuple::Size() == I1) { // 1d idx path - return MakeMerge1d(shape, descriptor_); + return MakeMerge1d(shape, naive_descriptor); } else { @@ -224,62 +249,55 @@ struct Layout static_assert(Tuple::Size() == Tuple::Size(), "Idx rank and Shape rank must be the same (except 1d)."); // Unroll while IdxDims is nested - const auto unrolled_shape_via_idx = UnrollShapeViaIdx(shape, idx); + const auto aligned_shape = AlignShapeToIdx(shape, idx); // Transform correct form of shape - return MakeMerges(unrolled_shape_via_idx, UnrollNestedTuple(idx), descriptor_); + return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), naive_descriptor); } } - template - __host__ __device__ static auto MakeNaiveDescriptor(const LayoutShape& shape, - const LayoutStrides& strides) - { - const auto unrolled_shape = UnrollNestedTuple(shape); - - if constexpr(ck::is_same_v>) - { - // If shape is packed - const auto column_major_packed_strides = - GenerateColumnMajorPackedStrides(unrolled_shape); - return make_naive_tensor_descriptor(unrolled_shape, column_major_packed_strides); - } - else - { - const auto unrolled_strides = UnrollNestedTuple(strides); - static_assert(unrolled_shape.Size() == unrolled_strides.Size(), - "Size of strides and shape are not consistent."); - return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); - } - } + using MergedNestsDescriptorType = remove_cvref_t; public: - using NaiveDescriptorType = remove_cvref_t; + __host__ __device__ constexpr auto GetElementSpaceSize() const + { + return flatten_descriptor_.GetElementSpaceSize(); + } + __host__ __device__ Layout() = delete; /** * \brief Layout constructor. * * \param shape Shape for layout. * \param strides Strides for layout (optional if tensor is packed). - * \return Layout object. */ - __host__ __device__ Layout() = delete; - __host__ __device__ Layout(const Shape& shape, const Strides& strides) : descriptor_{} + __host__ __device__ constexpr Layout(const Shape& shape, const Strides& strides) + : flatten_descriptor_{}, shape_(shape), strides_(strides) { // Construct if runtime mode - if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime()) + if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) { - // Keep only shape, strides are not need for transforms - shape_ = shape; - descriptor_ = MakeNaiveDescriptor(shape, strides); + flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_); + descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_); + merged_nests_descriptor_ = + TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_); } } - __host__ __device__ Layout(const Shape& shape) : descriptor_{} + /** + * \brief Layout constructor (with default packed column-major strides). + * + * \param shape Shape for layout. + */ + __host__ __device__ constexpr Layout(const Shape& shape) + : flatten_descriptor_{}, shape_(shape), strides_(GenerateColumnMajorPackedStrides(shape_)) { - if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime()) + if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) { - shape_ = shape; - descriptor_ = MakeNaiveDescriptor(shape, Strides{}); + flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_); + descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_); + merged_nests_descriptor_ = + TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_); } } @@ -292,7 +310,9 @@ struct Layout template __host__ __device__ constexpr index_t operator()() const { - using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{})); + static_assert(FlattenDescriptorType::IsKnownAtCompileTime(), + "Compiletime operator used on runtime layout."); + using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, FlattenDescriptorType{})); using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{})); return TransformedDesc{}.CalculateOffset(UnrolledIdx{}); } @@ -306,9 +326,22 @@ struct Layout template __host__ __device__ index_t operator()(const Tuple& Idx) const { - // Static to construct transformed_desc only once - static const auto transformed_desc = TransformDesc(shape_, Idx); - return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx)); + if constexpr(!IsNestedTuple(Tuple{}) && Tuple::Size() == 1) + { + // if 1d access + return descriptor_1d_.CalculateOffset(Idx); + } + else if constexpr(!IsNestedTuple(Tuple{}) && Tuple::Size() == Shape::Size()) + { + // if Shape::Size() access (merged nested shapes) + return merged_nests_descriptor_.CalculateOffset(UnrollNestedTuple(Idx)); + } + else + { + // Custom index, need to transform descriptor + const auto transformed_desc = TransformDesc(shape_, Idx, flatten_descriptor_); + return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx)); + } } /** @@ -338,7 +371,7 @@ struct Layout * * \return Calculated size. */ - __host__ __device__ constexpr index_t GetLength() const + __host__ __device__ constexpr index_t GetLengths() const { const auto unrolled_shape = UnrollNestedTuple(shape_); return TupleReduce([](auto x, auto y) { return x * y; }, @@ -346,80 +379,56 @@ struct Layout } /** - * \brief Dimension getter. + * \brief Shape getter. * - * \tparam IDim Dimension idx. - * \return Calculated size. + * \return Shape. */ - template - __host__ __device__ constexpr auto Get() const + __host__ __device__ constexpr const Shape& GetShape() const { return shape_; } + + /** + * \brief Strides getter. + * + * \return Strides. + */ + __host__ __device__ constexpr const DeducedStrides& GetStrides() const { return strides_; } + + /** + * \brief Get default lengths (tuple filled with Shape length elements). + * + * \return Default lengths. + */ + __host__ __device__ constexpr auto GetDefaultLengthsTuple() const { - const auto elem = shape_.At(Number{}); - return elem; + return generate_tuple([&](auto i) { return GetLength(); }, Number{}); + } + + /** + * \brief Get default start idx (tuple filled with 0s of the same size as Shape). + * + * \return Default start idx. + */ + __host__ __device__ constexpr auto GetDefaultStartIdxs() const + { + return GenerateDefaultIdxsTuple(shape_); + } + + /** + * \brief Get default descriptor (with the same size as Shape) + * + * \return Default descriptor. + */ + __host__ __device__ constexpr MergedNestsDescriptorType GetDefaultDescriptor() + { + return merged_nests_descriptor_; } private: - NaiveDescriptorType descriptor_; - Shape shape_; + FlattenDescriptorType flatten_descriptor_; + Descriptor1dType descriptor_1d_; + MergedNestsDescriptorType merged_nests_descriptor_; + const Shape shape_; + const DeducedStrides strides_; }; -// Layout helpers -// Length getter (product if tuple) -template -__host__ __device__ constexpr index_t size(const Layout& layout) -{ - return layout.template GetLength(); -} - -// Get shape size (product of dims if tuple) -template -__host__ __device__ constexpr index_t size(const Tuple& shape) -{ - using UnrolledShape = decltype(UnrollNestedTuple(shape)); - return TupleReduce<0, UnrolledShape::Size()>([](auto x, auto y) { return x * y; }, - UnrolledShape{}); -} - -// Get dim size (could be returned from get function) -template -__host__ __device__ T constexpr size(const T& dim) -{ - return dim; -} - -// Get layout size (product of shapes) -template -__host__ __device__ constexpr index_t size(const Layout& layout) -{ - return layout.GetLength(); -} - -// Get shape element size -template -__host__ __device__ constexpr index_t size(const Tuple& shape) -{ - return size(shape.At(Number{})); -} - -// Dim getter (tuple if tuple) -template -__host__ __device__ constexpr auto get(const Layout& layout) -{ - return layout.template Get(); -} - -template -__host__ __device__ constexpr Layout make_layout(const Shape& shape, - const Strides& strides) -{ - return Layout(shape, strides); -} - -template -__host__ __device__ constexpr Layout make_layout(const Shape& shape) -{ - return Layout(shape); -} - -} // namespace tensor_transform_wrapper +} // namespace wrapper } // namespace ck diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp new file mode 100644 index 0000000000..4ec6498fbc --- /dev/null +++ b/include/ck/wrapper/tensor.hpp @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "utils/tensor_utils.hpp" +#include "utils/layout_utils.hpp" + +namespace ck { +namespace wrapper { + +/** + * \brief Tensor wrapper that performs static and dynamic buffer logic. + * + * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR). + * \tparam ElementType Element data type. + * \tparam Shape Tensor shape (layout component). + * \tparam Strides Tensor strides (layout component). + * \tparam NumVectors Number of vectors (only for VGPR, SGPR). + * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR). + */ +template +struct Tensor +{ + private: + // Check if Tuple contains Slice object + template + constexpr static bool IsSlicing(T&&) + { + return is_detected::value; + } + template + constexpr static bool IsSlicing(Tuple&&) + { + return (IsSlicing(Ts{}) || ...); + } + + // Calculate first index of new tensor after slice + // It is needed to calculate offset for new tensor + template + constexpr auto GetStartIdxForSlicedTensor(const Tuple& idx) const + { + const auto start_idx_for_sliced_tensor = generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + // if tuple then recurrence + return GetStartIdxForSlicedTensor(idx.At(num_i)); + } + else if constexpr(is_detected>>::value) + { + // if slice, return the beginning of the interval + return idx.At(num_i).from_; + } + else + { + // if one dim selected + return idx.At(num_i); + } + }, + Number::Size()>{}); + + return start_idx_for_sliced_tensor; + } + + // Calculate new tensor shape after slice + template + constexpr auto GetShapeFromSlicedTensor(const Tuple& idx, + const ShapeTmpType& shape) const + { + // Pack each value in tuple to remove empty tuples after generation + auto new_shape = generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + if constexpr(!IsSlicing(tuple_element_t>{})) + { + // if tuple does not have any slice then we can remove dimension + return Tuple<>{}; + } + else + { + // if tuple then recurrence + return make_tuple(GetShapeFromSlicedTensor(idx.At(num_i), shape.At(num_i))); + } + } + else if constexpr(is_detected>>::value) + { + // calculate new dimension + const auto& dim = size(shape.At(num_i)); + const auto val = idx.At(num_i).range(dim); + return make_tuple(val); + } + else + { + // remove dimension for just value + return Tuple<>{}; + } + }, + Number::Size()>{}); + // Remove empty tuples (deleted elements) and return + return UnrollNestedTuple<0, 1>(new_shape); + } + + template + constexpr auto GetStridesFromSlicedTensor(const Tuple& idx, + const StridesTmpType& strides) const + { + // Pack each value in tuple to remove empty tuples after generation + auto new_strides = generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + if constexpr(!IsSlicing(tuple_element_t>{})) + { + // if tuple does not have any slice then we can remove dimension + return Tuple<>{}; + } + else + { + // if tuple then recurrence + return make_tuple( + GetStridesFromSlicedTensor(idx.At(num_i), strides.At(num_i))); + } + } + else if constexpr(is_detected>>::value) + { + // Stride will be the same + return make_tuple(strides.At(num_i)); + } + else + { + // remove dimension for just value + return Tuple<>{}; + } + }, + Number::Size()>{}); + // Remove empty tuples (deleted elements) and return + return UnrollNestedTuple<0, 1>(new_strides); + } + + public: + using ElementSpaceSize = decltype(Layout{ + Shape{}, Strides{}}.GetElementSpaceSize()); // SpaceSize type for buffer + using TensorElementType = ElementType; // DataType + + static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace; + static constexpr bool IsDynamicBuffer = !(BufferAddressSpace == MemoryTypeEnum ::Sgpr || + BufferAddressSpace == MemoryTypeEnum ::Vgpr); + + __host__ __device__ Tensor() = delete; + __host__ __device__ Tensor(ElementType* pointer, const Layout& layout) + : layout_(layout), + buffer_(make_dynamic_buffer(pointer, layout.GetElementSpaceSize())) + { + } + + __host__ __device__ Tensor(const Layout& layout) : layout_(layout) + { + static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register."); + } + + __host__ __device__ constexpr const Layout& GetLayout() const + { + return layout_; + } + + // Getter for new sliced tensor + template {}), bool> = false> + __host__ __device__ auto operator[](const Tuple& idx) const + { + static_assert(IsDynamicBuffer, "Register slice is not supported"); + // Calculate offset based on first idx for new tensor + const index_t offset = layout_(GetStartIdxForSlicedTensor(idx)); + + auto new_shape = GetShapeFromSlicedTensor(idx, layout_.GetShape()); + if constexpr(is_same_v>) + { + auto new_layout = make_layout(new_shape); + return make_tensor(buffer_.p_data_ + offset, new_layout); + } + else + { + auto new_strides = GetStridesFromSlicedTensor(idx, layout_.GetStrides()); + auto new_layout = make_layout(new_shape, new_strides); + return make_tensor(buffer_.p_data_ + offset, new_layout); + } + } + + template {}), bool> = false> + __host__ __device__ auto operator()(const Tuple& idx) const + { + return this->operator[](idx); + } + + template {}), bool> = false> + __host__ __device__ auto operator()(Idxs... idxs) const + { + return this->operator[](make_tuple(idxs...)); + } + + // Getter for the const value + template {}), bool> = false> + __host__ __device__ const ElementType& operator[](const Tuple& idx) const + { + if constexpr(IsDynamicBuffer) + { + const index_t offset = layout_(idx); + return buffer_[offset]; + } + else + { + if constexpr(is_same_v>) + { + constexpr index_t offset = + Layout{Shape{}}.template operator()>(); + return buffer_[Number{}]; + } + else + { + constexpr index_t offset = + Layout{Shape{}, Strides{}}.template operator()>(); + return buffer_[Number{}]; + } + } + } + + template {}), bool> = false> + __host__ __device__ const ElementType& operator()(const Tuple& idx) const + { + return this->operator[](idx); + } + + template {}), bool> = false> + __host__ __device__ const ElementType& operator()(Idxs... idxs) const + { + return this->operator[](make_tuple(idxs...)); + } + + // Getter for the value reference + template {}), bool> = false> + __host__ __device__ ElementType& operator[](const Tuple& idx) + { + if constexpr(IsDynamicBuffer) + { + const index_t offset = layout_(idx); + return buffer_(offset); + } + else + { + if constexpr(is_same_v>) + { + constexpr index_t offset = + Layout{Shape{}}.template operator()>(); + return buffer_(Number{}); + } + else + { + constexpr index_t offset = + Layout{Shape{}, Strides{}}.template operator()>(); + return buffer_(Number{}); + } + } + } + + template {}), bool> = false> + __host__ __device__ ElementType& operator()(const Tuple& idx) + { + return this->operator[](idx); + } + + template {}), bool> = false> + __host__ __device__ ElementType& operator()(Idxs... idxs) + { + return this->operator[](make_tuple(idxs...)); + } + + __host__ __device__ constexpr auto GetDefaultDescriptor() + { + return layout_.GetDefaultDescriptor(); + } + + private: + using DynamicBufferType = DynamicBuffer; + using StaticBufferType = + StaticBufferTupleOfVector; + // If register use static buffer, else use dynamic buffer + using Buffer = std::conditional_t; + + const Layout layout_; + Buffer buffer_; +}; + +} // namespace wrapper +} // namespace ck diff --git a/include/ck/wrapper/utils/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp new file mode 100644 index 0000000000..5df9dd7dea --- /dev/null +++ b/include/ck/wrapper/utils/layout_utils.hpp @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" + +#include "ck/utility/number.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/utility/tuple_helper.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/utility/sequence_helper.hpp" +#include "ck/utility/is_detected.hpp" + +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" + +namespace ck { +namespace wrapper { + +// Disable from doxygen docs generation +/// @cond +// forward declaration +template +struct Layout; + +template +using is_tuple = decltype(std::declval().IsTuple()); +/// @endcond + +// make_* +/** + * \brief Make layout function. + * + * \tparam Shape Shape for layout. + * \tparam Strides Strides for layout. + * \return Constructed layout. + */ +template +__host__ __device__ constexpr Layout make_layout(const Shape& shape, + const Strides& strides) +{ + return Layout(shape, strides); +} + +/** + * \brief Make layout function with packed strides + * (column-major). + * + * \tparam Shape Shape for layout. + * \return Constructed layout. + */ +template +__host__ __device__ constexpr Layout> make_layout(const Shape& shape) +{ + return Layout>(shape); +} + +// Layout helpers +// get +// Get dim (could be returned from get with empty Idxs) +/** + * \private + */ +template +__host__ __device__ T constexpr get(const T& dim) +{ + return dim; +} + +/** + * \brief Get element from tuple (Shape/Strides/Idxs). + * + * \tparam idx Index to lookup. + * \param tuple Tuple to lookup. + * \return Requsted element. + */ +template +__host__ __device__ constexpr auto get(const Tuple& tuple) +{ + return tuple.At(Number{}); +} + +/** + * \brief Get sub layout. + * + * \tparam idx Index to lookup. + * \param layout Layout to create sub layout. + * \return Requsted sub layout. + */ +template +__host__ __device__ constexpr auto get(const Layout& layout) +{ + const auto& shape = layout.GetShape(); + const auto& new_shape = get(shape); + static_assert(is_detected::value, + "Shape of sub layout must be tuple"); + if constexpr(is_same_v>) + { + // If stride not passed, create without strides + return make_layout(new_shape); + } + else + { + const auto& strides = layout.GetStrides(); + const auto& new_strides = get(strides); + static_assert(is_detected::value, + "Strides of sub layout must be tuple"); + return make_layout(new_shape, new_strides); + } +} + +/** + * \brief Hierarchical get. + * + * \tparam Idxs Indexes to lookup. + * \param elem Element to lookup. + * \return Requsted element. + */ +template +__host__ __device__ constexpr auto get(const T& elem) +{ + return get(get(elem)); +} + +// size +// Get dim size (could be returned from get function) +/** + * \private + */ +template +__host__ __device__ T constexpr size(const T& dim) +{ + return dim; +} + +/** + * \brief Length get (product if tuple). + * + * \tparam idx Index to lookup. + * \param layout Layout to get Shape of. + * \return Requsted length. + */ +template +__host__ __device__ constexpr index_t size(const Layout& layout) +{ + return layout.template GetLength(); +} + +/** + * \brief Shape size (product of dims). + * + * \param shape Shape to lookup. + * \return Requsted size. + */ +template +__host__ __device__ constexpr index_t size(const Tuple& shape) +{ + const auto unrolled_shape = UnrollNestedTuple(shape); + return TupleReduce<0, unrolled_shape.Size()>([](auto x, auto y) { return x * y; }, + unrolled_shape); +} + +/** + * \brief Layout size (product of dims). + * + * \param layout Layout to calculate shape size. + * \return Requsted size. + */ +template +__host__ __device__ constexpr index_t size(const Layout& layout) +{ + return layout.GetLengths(); +} + +/** + * \brief Length get from tuple (product if tuple). + * + * \tparam idx Index to lookup. + * \param tuple Tuple to lookup. + * \return Requsted length. + */ +template +__host__ __device__ constexpr index_t size(const Tuple& tuple) +{ + return size(tuple.At(Number{})); +} + +/** + * \brief Hierarchical size. + * + * \tparam Idx First index to lookup (to avoid empty Idxs). + * \tparam Idxs Next indexes to lookup. + * \param elem Element to lookup. + * \return Requsted element. + */ +template +__host__ __device__ constexpr auto size(const T& elem) +{ + return size(get(elem)); +} + +// rank +/** + * \brief Get layout rank (num elements in shape). + * + * \param layout Layout to calculate rank. + * \return Requsted rank. + */ +template +__host__ __device__ constexpr auto rank([[maybe_unused]] const Layout& layout) +{ + return Shape::Size(); +} + +/** + * \brief Get tuple rank (num elements in tuple). + * Return 1 if scalar passed. + * + * \param tuple Tuple to calculate rank. + * \return Requsted rank. + */ +template +__host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple& tuple) +{ + return Tuple::Size(); +} + +/** + * \private + */ +template +__host__ __device__ constexpr index_t rank(const Number&) +{ + return 1; +} + +/** + * \private + */ +__host__ __device__ constexpr index_t rank(const index_t&) { return 1; } + +/** + * \brief Hierarchical rank. + * + * \tparam Idxs Indexes to lookup. + * \param elem Element to lookup. + * \return Requsted rank. + */ +template +__host__ __device__ constexpr auto rank(const T& elem) +{ + return rank(get(elem)); +} + +// depth +/** + * \brief Get depth of the layout shape (return 0 if scalar). + * + * \param layout Layout to calculate depth. + * \return Requsted depth. + */ +template +__host__ __device__ constexpr auto depth(const Layout& layout) +{ + const auto& shape = layout.GetShape(); + return TupleDepth(shape); +} + +/** + * \brief Get depth of the tuple. (return 0 if scalar) + * + * \param tuple Tuple to calculate depth. + * \return Requsted depth. + */ +template +__host__ __device__ constexpr auto depth(const Tuple& tuple) +{ + return TupleDepth(tuple); +} + +/** + * \private + */ +template +__host__ __device__ constexpr index_t depth(const Number&) +{ + return 0; +} + +/** + * \private + */ +__host__ __device__ constexpr index_t depth(const index_t&) { return 0; } + +/** + * \brief Hierarchical depth. + * + * \tparam Idxs Indexes to lookup. + * \param elem Element to lookup. + * \return Requsted depth. + */ +template +__host__ __device__ constexpr auto depth(const T& elem) +{ + return depth(get(elem)); +} + +/** + * \brief Get Layout strides. + * + * \param layout Layout to get strides from. + * \return Requsted strides. + */ +template +__host__ __device__ constexpr const auto& stride(const Layout& layout) +{ + return layout.GetStrides(); +} + +/** + * \brief Get Layout shape. + * + * \param layout Layout to get shape from. + * \return Requsted shape. + */ +template +__host__ __device__ constexpr const auto& shape(const Layout& layout) +{ + return layout.GetShape(); +} + +} // namespace wrapper +} // namespace ck diff --git a/include/ck/wrapper/utils/tensor_utils.hpp b/include/ck/wrapper/utils/tensor_utils.hpp new file mode 100644 index 0000000000..5f0dc3e500 --- /dev/null +++ b/include/ck/wrapper/utils/tensor_utils.hpp @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" + +#include "ck/utility/number.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/utility/tuple_helper.hpp" +#include "ck/utility/dynamic_buffer.hpp" +#include "ck/utility/amd_address_space.hpp" + +namespace ck { +namespace wrapper { + +/** + * \brief Memory type, allowed members: + * - Generic, + * - Global, + * - LDS, + * - SGPR, + * - VGPR, + */ +using MemoryTypeEnum = AddressSpaceEnum; + +// Disable from doxygen docs generation +/// @cond +// forward declarations +template +struct Layout; +template + +struct Tensor; + +template +struct Slice +{ + __host__ __device__ constexpr Slice() : from_(), to_() {} + __host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {} + + template + __host__ __device__ constexpr auto range(const T& dim) const + { + if constexpr(is_same_v || is_same_v || + is_same_v) + { + assert(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_) && "Invalid range"); + if(to_ < 0) + { + return dim - from_ + to_ + 1; + } + else + { + // workaround if one end of the interval is index_t and the second one is Number + return static_cast(to_) - static_cast(from_); + } + } + else + { + static_assert(dim >= to_ && from_ >= Number<0>{} && (to_ < 0 || to_ > from_), + "Invalid range"); + if constexpr(to_ < 0) + { + return dim - from_ + to_ + Number<1>{}; + } + else + { + return to_ - from_; + } + } + } + + __host__ __device__ static constexpr bool IsSlice() { return true; } + + const FromType from_; + const ToType to_; +}; + +template +using is_slice = decltype(std::declval().IsSlice()); + +template +using is_tuple = decltype(std::declval().IsTuple()); +/// @endcond + +/** + * \brief Make tensor function. + * + * \tparam MemoryType Type of memory. + * \param pointer Pointer to the memory. + * \param layout Tensor layout. + * \return Constructed tensor. + */ +template +constexpr auto make_tensor(ElementType* pointer, const Layout& layout) +{ + return Tensor( + pointer, layout); +} + +/** + * \brief Make SGPR or VGPR tensor function. + * + * \tparam MemoryType Type of memory. + * \tparam NumVectors Number of vectors. + * \tparam ScalarPerVector Scalars per vector. + * \tparam ElementType Memory data type. + * \param layout Tensor layout. + * \return Constructed tensor. + */ +template +constexpr auto make_register_tensor(const Layout& layout) +{ + static_assert(!IsNestedTuple(Shape{}), "Register tensor with nested layout is not supported"); + return Tensor(layout); +} + +/** + * \brief Get Tensor Layout. + * + * \param tensor Tensor to get layout of. + * \return Requsted layout. + */ +template +__host__ __device__ constexpr const auto& +layout(const Tensor& + tensor) +{ + return tensor.GetLayout(); +} + +/** + * \brief Product of tensor shape dims. + * + * \tparam Idxs Indexes to access specific shape dim (optional). + * \param tensor Tensor to get Shape of. + * \return Requsted size. + */ +template +__host__ __device__ constexpr index_t +size(const Tensor& + tensor) +{ + return size(tensor.GetLayout()); +} + +/** + * \brief Rank of Shape tuple. + * + * \tparam Idxs Indexes to access specific shape dim (optional). + * \param tensor Tensor to get rank of. + * \return Requsted rank. + */ +template +__host__ __device__ constexpr index_t +rank(const Tensor& + tensor) +{ + return rank(tensor.GetLayout()); +} + +/** + * \brief Depth of Shape tuple. + * + * \tparam Idxs Indexes to access specific shape dim (optional). + * \param tensor Tensor to get depth of. + * \return Requsted depth. + */ +template +__host__ __device__ constexpr index_t +depth(const Tensor& + tensor) +{ + return depth(tensor.GetLayout()); +} + +/** + * \brief Get Tensor strides. + * + * \param tensor Tensor to get strides from. + * \return Requsted strides. + */ +template +__host__ __device__ constexpr const auto& +stride(const Tensor& + tensor) +{ + return stride(tensor.GetLayout()); +} + +/** + * \brief Get Tensor shape. + * + * \param tensor Tensor to get shape from. + * \return Requsted shape. + */ +template +__host__ __device__ constexpr const auto& +shape(const Tensor& + tensor) +{ + return shape(tensor.GetLayout()); +} + +/** + * \brief Get dim slice. + * + * \param from Beginning of the interval. + * \param to End of the interval. (could be also negative to index from the end) + * \return Requested slice. Could be used to create sliced tensor from other tensor. + */ +template +constexpr auto slice(const FromType from, const ToType to) +{ + return Slice(from, to); +} + +/** + * \brief Get dim slice. (Assumed that from is equal to 1) + * + * \param to End of the interval. (could be also negative to index from the end) + * \return Requested slice. Could be used to create sliced tensor from other tensor. + */ +template +constexpr auto slice(const ToType to) +{ + if constexpr(is_same_v) + { + return Slice(0, to); + } + else + { + return Slice, ToType>(Number<0>{}, to); + } +} + +/** + * \brief Get whole dim slice (from = 0, to = -1). + * + * \return Requested slice. Could be used to create sliced tensor from other tensor. + */ +constexpr auto slice() { return Slice, Number<-1>>(Number<0>{}, Number<-1>{}); } + +} // namespace wrapper +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp index 89b8b9667f..dc47c7ec1a 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -86,9 +86,9 @@ using NHWGK = ck::tensor_layout::convolution::NHWGK; using NDHWGK = ck::tensor_layout::convolution::NDHWGK; // -using GK = ck::tensor_layout::convolution::G_K; -using GK_Tuple = ck::Tuple; -using GK_GK_Tuple = ck::Tuple; +using G_K = ck::tensor_layout::convolution::G_K; +using GK_Tuple = ck::Tuple; +using GK_GK_Tuple = ck::Tuple; // pointwise functor using PassThrough = ck::tensor_operation::element_wise::PassThrough; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp index b43d34d69a..b67119ad19 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp @@ -61,7 +61,11 @@ using device_contraction_kk_instance = std::tuple< DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, ComputeDataType>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, ComputeDataType> + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, ComputeDataType>, + // Small scalar per vector + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType> // clang-format on >; @@ -96,7 +100,11 @@ using device_contraction_kn_instance = std::tuple< DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType> + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, + // Small scalar per vector + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType> // clang-format on >; @@ -131,7 +139,11 @@ using device_contraction_mk_instance = std::tuple< DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType> + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, + // Small scalar per vector + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType> // clang-format on >; @@ -166,7 +178,11 @@ using device_contraction_mn_instance = std::tuple< DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType> + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, + // Small scalar per vector + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp similarity index 96% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp rename to library/include/ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp index 82eae9f0a2..005cec94ec 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp @@ -25,10 +25,6 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; - -static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // Compilation parameters for a[m, k] * b[k, n] = c[m, n] template using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple< @@ -37,7 +33,7 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple< //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - // pipeline v1, 1 wave + // pipeline v1, 1 wave DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 256, 64, 16, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v1>, @@ -75,7 +71,8 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple< DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Interwave, PipelineVersion::v1> #endif -#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES +#if 0 + //CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES // pipeline v2, 1 wave , DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v2>, @@ -98,17 +95,6 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple< // clang-format on >; -void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances{}); - - add_device_operation_instances( - instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp index bbc70f1a5b..626dd7f00a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp @@ -345,7 +345,11 @@ void add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances( std::vector>>& instances); -void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances( +void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances( std::vector>>& instances); @@ -575,7 +579,8 @@ struct DeviceOperationInstanceFactory< if constexpr(is_same_v && is_same_v && is_same_v) { - add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp index dc9f44dc86..efb6266426 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp @@ -27,7 +27,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, BF16, BF16, @@ -43,7 +43,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, F16, F16, @@ -59,7 +59,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, F32, F32, @@ -75,7 +75,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, int8_t, int8_t, @@ -130,7 +130,9 @@ struct DeviceOperationInstanceFactory> op_ptrs; if constexpr(NumDimSpatial == 3 && is_same_v && - is_same_v && is_same_v) + is_same_v && is_same_v && + DLayouts::Size() == 2 && is_same_v, NDHWGK> && + is_same_v, G_K>) { #ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt index d0bcacbe3c..3532c3f4ba 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -101,7 +101,8 @@ list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp) list(APPEND GEMM_INSTANCES - device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp + device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp) diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp index 3c9e03b674..b3d1e925df 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp @@ -16,6 +16,7 @@ namespace tensor_operation { namespace device { namespace instance { +using F8 = ck::f8_t; using F16 = ck::half_t; using F32 = float; diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp index aab0af990d..9c80995949 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp @@ -16,6 +16,7 @@ namespace tensor_operation { namespace device { namespace instance { +using F8 = ck::f8_t; using F16 = ck::half_t; using F32 = float; diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp new file mode 100644 index 0000000000..baa76a74af --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp" + +#ifdef CK_ENABLE_FP8 +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp new file mode 100644 index 0000000000..f16809db28 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp" + +#ifdef CK_ENABLE_FP8 +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp index c6627a4825..7d2df94ad7 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp @@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, BF16, BF16, @@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwdDefault>{}); add_device_operation_instances( @@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1P0>{}); add_device_operation_instances( @@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1S1P0>{}); } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp index 627af24d7b..8a09d03967 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp @@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, F16, F16, @@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwdDefault>{}); add_device_operation_instances( @@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1P0>{}); add_device_operation_instances( @@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1S1P0>{}); } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp index 1fd567e360..6966959639 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp @@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, F32, F32, @@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwdDefault>{}); add_device_operation_instances( @@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1P0>{}); add_device_operation_instances( @@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1S1P0>{}); } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp index dae292891c..2606f69428 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp @@ -12,7 +12,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, int8_t, int8_t, @@ -27,7 +27,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwdDefault>{}); add_device_operation_instances( @@ -35,7 +35,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1P0>{}); add_device_operation_instances( @@ -43,7 +43,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1S1P0>{}); } diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp index 711314985a..d46fe090b8 100644 --- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp +++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp @@ -22,13 +22,13 @@ using S = ck::Sequence; using NHWGC = ck::tensor_layout::convolution::NHWGC; using GKYXC = ck::tensor_layout::convolution::GKYXC; using NHWGK = ck::tensor_layout::convolution::NHWGK; -using GK = ck::tensor_layout::convolution::G_K; +using G_K = ck::tensor_layout::convolution::G_K; using PassThrough = ck::tensor_operation::element_wise::PassThrough; using Relu = ck::tensor_operation::element_wise::Relu; using TanH = ck::tensor_operation::element_wise::TanH; -using GK_Tuple = ck::Tuple; -using GK_GK_Tuple = ck::Tuple; +using GK_Tuple = ck::Tuple; +using GK_GK_Tuple = ck::Tuple; using I32_Tuple = ck::Tuple; using F32_Tuple = ck::Tuple; using I32_F32_Tuple = ck::Tuple; diff --git a/profiler/src/profile_transpose.cpp b/profiler/src/profile_transpose.cpp deleted file mode 100644 index c239a520d1..0000000000 --- a/profiler/src/profile_transpose.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include - -#include "profiler/profile_transpose_impl.hpp" -#include "profiler_operation_registry.hpp" - -enum struct MatrixLayout -{ - NCDHW, // 0 - NCHWD, // 1 -}; - -enum struct DataType -{ - F32_F32_F32_F32_F32, // 0 - F16_F16_F16_F16_F16, // 1 -}; - -#define OP_NAME "transpose" -#define OP_DESC "Transpose" - -int profile_transpose(int argc, char* argv[]) -{ - if(argc != 15) - { - printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); - printf("arg2: data type (0: fp32; 1: fp16)\n"); - // printf("arg3: matrix layout (NCDHW -> NDCHW);\n"); - printf("arg4: verification (0: no; 1: yes)\n"); - printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg6: print tensor value (0: no; 1: yes)\n"); - printf("arg7: time kernel (0=no, 1=yes)\n"); - printf("arg8 to 13: N, C, D, H, W\n"); - exit(1); - } - - const auto data_type = static_cast(std::stoi(argv[2])); - // const auto layout = static_cast(std::stoi(argv[3])); - const bool do_verification = std::stoi(argv[3]); - const int init_method = std::stoi(argv[4]); - const bool do_log = std::stoi(argv[5]); - const bool time_kernel = std::stoi(argv[6]); - std::vector lengths = std::stoi(argv[7]); - - /**const int N = std::stoi(argv[7]); - const int C = std::stoi(argv[8]); - const int D = std::stoi(argv[9]); - const int H = std::stoi(argv[10]); - const int W = std::stoi(argv[11]);**/ - - using F32 = float; - using F16 = ck::half_t; - - auto profile = [&](auto a_type, auto b_type) { - using ADataType = decltype(a_type); - using BDataType = decltype(b_type); - - bool pass = ck::profiler::profile_transpose_impl( - do_verification, init_method, do_log, time_kernel, lengths); - - return pass ? 0 : 1; - }; - - if(data_type == GemmDataType::F32_F32_F32_F32_F32) - { - return profile(F32{}, F32{}); - } - else if(data_type == GemmDataType::F16_F16_F16_F16_F16) - { - return profile(F16{}, F16{}); - } - else - { - std::cout << "this data_type & layout is not implemented" << std::endl; - - return 1; - } -} - -REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_transpose); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4aaa5fcfa5..b325a3a7f8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -149,6 +149,7 @@ add_subdirectory(batched_gemm_multi_d) add_subdirectory(grouped_convnd_bwd_data) add_subdirectory(conv_tensor_rearrange) add_subdirectory(transpose) +add_subdirectory(wrapper) if(GPU_TARGETS MATCHES "gfx11") add_subdirectory(wmma_op) endif() diff --git a/test/wrapper/CMakeLists.txt b/test/wrapper/CMakeLists.txt new file mode 100644 index 0000000000..6b25c08a8a --- /dev/null +++ b/test/wrapper/CMakeLists.txt @@ -0,0 +1,4 @@ +add_gtest_executable(test_layout test_layout.cpp) +target_link_libraries(test_layout PRIVATE utility) +add_gtest_executable(test_tensor test_tensor.cpp) +target_link_libraries(test_tensor PRIVATE utility) diff --git a/test/wrapper/test_layout.cpp b/test/wrapper/test_layout.cpp new file mode 100644 index 0000000000..14a8b96462 --- /dev/null +++ b/test/wrapper/test_layout.cpp @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/utility/common_header.hpp" + +#include "ck/wrapper/layout.hpp" + +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" + +class TestWrapperLayout : public ::testing::Test +{ + protected: + static constexpr auto I0 = ck::Number<0>{}; + static constexpr auto I1 = ck::Number<1>{}; + + template + void Run(Desc& desc, + Desc1d& desc_1d, + LayoutRuntime& layout_runtime, + LayoutCompiletime& layout_compiletime, + const std::vector& idxs) + { + // 1d check + EXPECT_EQ(desc_1d.GetLength(I0), ck::wrapper::size(layout_runtime)); + // Check layout compiletime and runtime result consistency + EXPECT_EQ(ck::wrapper::size(layout_runtime), ck::wrapper::size(layout_compiletime)); + + for(ck::index_t i = 0; i < desc_1d.GetLength(I0); i++) + { + const ck::index_t layout_runtime_offset_1d = layout_runtime(ck::make_tuple(i)); + const ck::index_t layout_compiletime_offset_1d = layout_compiletime(ck::make_tuple(i)); + const ck::index_t desc_offset_1d = desc_1d.CalculateOffset(ck::make_tuple(i)); + EXPECT_EQ(layout_runtime_offset_1d, desc_offset_1d); + EXPECT_EQ(layout_compiletime_offset_1d, layout_runtime_offset_1d); + } + // size(layout)-d check, don't check if access is hierarchical + if constexpr(!IsNestedTuple(Idxs{})) + { + ck::static_for<0, Idxs::Size(), 1>{}([&](auto d) { + EXPECT_EQ(desc.GetLength(ck::Number{}), ck::wrapper::size(layout_runtime)); + EXPECT_EQ(ck::wrapper::size(layout_runtime), + ck::wrapper::size(layout_compiletime)); + }); + } + for(const auto idx : idxs) + { + const ck::index_t layout_runtime_offset = layout_runtime(idx); + const ck::index_t layout_compiletime_offset = layout_compiletime(idx); + const ck::index_t desc_offset = + desc.CalculateOffset(UnrollNestedTuple(idx)); // Unroll if nested + EXPECT_EQ(layout_runtime_offset, desc_offset); + EXPECT_EQ(layout_runtime_offset, layout_compiletime_offset); + } + } +}; + +TEST_F(TestWrapperLayout, 2d) +{ + // dims:(4, 3) strides:(1, 4) + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + constexpr ck::index_t s1 = 1; + constexpr ck::index_t s0 = 4; + const auto desc = + ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})); + // Reverse due to column major + const auto desc_1d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1))), + ck::make_tuple(ck::Sequence<1, 0>{}), + ck::make_tuple(ck::Sequence<0>{})); + const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0)); + const auto layout_compiletime = + ck::wrapper::make_layout(ck::make_tuple(ck::Number{}, ck::Number{})); + std::vector> idxs; + + for(ck::index_t h = 0; h < d1; h++) + { + for(ck::index_t w = 0; w < d0; w++) + { + idxs.emplace_back(h, w); + } + } + + this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs); +} + +TEST_F(TestWrapperLayout, 3d_nested) +{ + // dims:((2, 3), 4, 3) strides:((2, 4), 12, 48) + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + constexpr ck::index_t s3 = 2; + constexpr ck::index_t s2 = 4; + constexpr ck::index_t s1 = 12; + constexpr ck::index_t s0 = 48; + const auto desc = ck::make_naive_tensor_descriptor( + ck::make_tuple(ck::Number{}, ck::Number{}, ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}, ck::Number{}, ck::Number{})); + // Reverse due to column major + const auto desc_1d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))), + ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}), + ck::make_tuple(ck::Sequence<0>{})); + const auto desc_3d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)), + ck::make_pass_through_transform(d1), + ck::make_pass_through_transform(d2)), + ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}), + ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{})); + const auto layout_runtime = + ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), d1, d0), + ck::make_tuple(ck::make_tuple(s3, s2), s1, s0)); + const auto layout_compiletime = ck::wrapper::make_layout( + ck::make_tuple( + ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}, ck::Number{}), + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), + ck::Number{}, + ck::Number{})); + std::vector> idxs_3d; + + for(ck::index_t d = 0; d < d2 * d3; d++) + { + for(ck::index_t h = 0; h < d1; h++) + { + for(ck::index_t w = 0; w < d0; w++) + { + idxs_3d.emplace_back(d, h, w); + } + } + } + this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d); + + // Check also 4d iteration + std::vector, ck::index_t, ck::index_t>> idxs_4d; + + for(ck::index_t e = 0; e < d3; e++) + { + for(ck::index_t d = 0; d < d2; d++) + { + for(ck::index_t h = 0; h < d1; h++) + { + for(ck::index_t w = 0; w < d0; w++) + { + idxs_4d.emplace_back(ck::make_tuple(e, d), h, w); + } + } + } + } + this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d); +} + +TEST_F(TestWrapperLayout, 2d_nested) +{ + // dims:((2, 3), (4, 3)) strides:((2, 4), (48, 12)) + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + constexpr ck::index_t s3 = 2; + constexpr ck::index_t s2 = 4; + constexpr ck::index_t s1 = 48; + constexpr ck::index_t s0 = 12; + const auto desc = ck::make_naive_tensor_descriptor( + ck::make_tuple(ck::Number{}, ck::Number{}, ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}, ck::Number{}, ck::Number{})); + // Reverse due to column major + const auto desc_1d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))), + ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}), + ck::make_tuple(ck::Sequence<0>{})); + const auto desc_2d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)), + ck::make_merge_transform(ck::make_tuple(d0, d1))), + ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}), + ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{})); + const auto layout_runtime = + ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), ck::make_tuple(d1, d0)), + ck::make_tuple(ck::make_tuple(s3, s2), ck::make_tuple(s1, s0))); + const auto layout_compiletime = ck::wrapper::make_layout( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})), + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}))); + std::vector> idxs_2d; + + for(ck::index_t h = 0; h < d2 * d3; h++) + { + for(ck::index_t w = 0; w < d0 * d1; w++) + { + idxs_2d.emplace_back(h, w); + } + } + this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d); + // Check also 4d iteration + std::vector, ck::Tuple>> + idxs_4d; + + for(ck::index_t e = 0; e < d3; e++) + { + for(ck::index_t d = 0; d < d2; d++) + { + for(ck::index_t h = 0; h < d1; h++) + { + for(ck::index_t w = 0; w < d0; w++) + { + idxs_4d.emplace_back(ck::make_tuple(e, d), ck::make_tuple(h, w)); + } + } + } + } + this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d); +} + +TEST_F(TestWrapperLayout, 3d_double_nested) +{ + // dims:(((2, 2), 3), (4, 3)) strides:(((2, 4), 8), (96, 24)) + constexpr ck::index_t d4 = 2; + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + constexpr ck::index_t s4 = 2; + constexpr ck::index_t s3 = 4; + constexpr ck::index_t s2 = 8; + constexpr ck::index_t s1 = 96; + constexpr ck::index_t s0 = 24; + const auto desc = ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number{}, + ck::Number{}, + ck::Number{}, + ck::Number{}, + ck::Number{}), + ck::make_tuple(ck::Number{}, + ck::Number{}, + ck::Number{}, + ck::Number{}, + ck::Number{})); + // Reverse due to column major + const auto desc_1d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3, d4))), + ck::make_tuple(ck::Sequence<4, 3, 2, 1, 0>{}), + ck::make_tuple(ck::Sequence<0>{})); + const auto desc_3d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d3, d4)), + ck::make_pass_through_transform(d2), + ck::make_merge_transform(ck::make_tuple(d0, d1))), + ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<4, 3>{}), + ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{})); + const auto desc_2d = transform_tensor_descriptor( + desc_3d, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3 * d4)), + ck::make_pass_through_transform(d1 * d0)), + ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}), + ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{})); + const auto layout_runtime = ck::wrapper::make_layout( + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)), + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, s3), s2), ck::make_tuple(s1, s0))); + const auto layout_compiletime = ck::wrapper::make_layout( + ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})), + ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}))); + std::vector> idxs_2d; + + for(ck::index_t h = 0; h < d2 * d3 * d4; h++) + { + for(ck::index_t w = 0; w < d0 * d1; w++) + { + idxs_2d.emplace_back(h, w); + } + } + this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d); + // Check also 3d iteration + std::vector, ck::index_t>> idxs_3d; + + for(ck::index_t d = 0; d < d3 * d4; d++) + { + for(ck::index_t h = 0; h < d2; h++) + { + for(ck::index_t w = 0; w < d1 * d0; w++) + { + idxs_3d.emplace_back(ck::make_tuple(d, h), w); + } + } + } + this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d); + // Check also 5d iteration + std::vector, ck::index_t>, + ck::Tuple>> + idxs_5d; + + for(ck::index_t f = 0; f < d4; f++) + { + for(ck::index_t e = 0; e < d3; e++) + { + for(ck::index_t d = 0; d < d2; d++) + { + for(ck::index_t h = 0; h < d1; h++) + { + for(ck::index_t w = 0; w < d0; w++) + { + idxs_5d.emplace_back(ck::make_tuple(ck::make_tuple(f, e), d), + ck::make_tuple(h, w)); + } + } + } + } + } + this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_5d); +} + +TEST(TestLayoutHelpers, SizeAndGet) +{ + // dims:(((2, 2), 3), (4, 3)) + constexpr ck::index_t d4 = 2; + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + const auto layout_runtime = ck::wrapper::make_layout( + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0))); + const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}))); + + // Size of layout + EXPECT_EQ(ck::wrapper::size(layout_runtime), d4 * d3 * d2 * d1 * d0); + EXPECT_EQ(ck::wrapper::size(layout_compiletime), d4 * d3 * d2 * d1 * d0); + + // Size of dims + EXPECT_EQ(ck::wrapper::size<0>(layout_runtime), d4 * d3 * d2); + EXPECT_EQ(ck::wrapper::size<0>(layout_compiletime), d4 * d3 * d2); + EXPECT_EQ(ck::wrapper::size<1>(layout_runtime), d1 * d0); + EXPECT_EQ(ck::wrapper::size<1>(layout_compiletime), d1 * d0); + + // Access through new layout (using get with layout object) + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_runtime)), d4 * d3); + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_compiletime)), d4 * d3); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2); + + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d4); + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))), + d4); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d3); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))), + d3); + + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2); + + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_runtime)), d1); + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_compiletime)), d1); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_runtime)), d0); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_compiletime)), d0); +} + +TEST(TestLayoutHelpers, DepthAndRank) +{ + // dims:(((2, 2), 3), (4, 3)) + constexpr ck::index_t d4 = 2; + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + const auto layout_runtime = ck::wrapper::make_layout( + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0))); + const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}))); + + EXPECT_EQ(ck::wrapper::depth(layout_runtime), 3); + EXPECT_EQ(ck::wrapper::depth(layout_compiletime), 3); + EXPECT_EQ(ck::wrapper::depth(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2); + // Check for integer + EXPECT_EQ(ck::wrapper::depth(d0), 0); + + EXPECT_EQ(ck::wrapper::rank(layout_runtime), 2); + EXPECT_EQ(ck::wrapper::rank(layout_compiletime), 2); + EXPECT_EQ(ck::wrapper::rank(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2); + // Check for integer + EXPECT_EQ(ck::wrapper::rank(d0), 1); +} + +TEST(TestLayoutHelpers, ShapeAndStrides) +{ + // dims:(((2, 2), 3), (4, 3)) + constexpr ck::index_t d4 = 2; + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + constexpr ck::index_t s4 = 2; + constexpr ck::index_t s3 = 4; + constexpr ck::index_t s2 = 8; + constexpr ck::index_t s1 = 96; + constexpr ck::index_t s0 = 24; + const auto shape_compiletime = ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})); + const auto strides_compiletime = ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})); + const auto shape_runtime = + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)); + const auto strides_runtime = + ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0)); + const auto layout_runtime = ck::wrapper::make_layout(shape_runtime, strides_runtime); + const auto layout_compiletime = + ck::wrapper::make_layout(shape_compiletime, strides_compiletime); + + constexpr bool check_compiletime_shape = + std::is_same_v>; + constexpr bool check_compiletime_strides = + std::is_same_v>; + constexpr bool check_runtime_shape = + std::is_same_v>; + constexpr bool check_runtime_strides = + std::is_same_v>; + EXPECT_TRUE(check_compiletime_shape); + EXPECT_TRUE(check_compiletime_strides); + EXPECT_TRUE(check_runtime_shape); + EXPECT_TRUE(check_runtime_strides); +} + +TEST(TestLayoutHelpers, Hierarchical) +{ + // dims:(((2, 2), 3), (4, 3)) + constexpr ck::index_t d4 = 2; + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + const auto runtime_shape = + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)); + const auto layout_runtime = ck::wrapper::make_layout(runtime_shape); + const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}))); + + EXPECT_EQ((ck::wrapper::rank<0, 0>(runtime_shape)), 2); + EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_runtime)), 2); + EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_compiletime)), 2); + + EXPECT_EQ((ck::wrapper::depth<0, 0>(runtime_shape)), 1); + EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_runtime)), 1); + EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_compiletime)), 1); + + EXPECT_EQ((ck::wrapper::size<0, 0>(runtime_shape)), d4 * d3); + EXPECT_EQ((ck::wrapper::size<0, 0>(layout_runtime)), d4 * d3); + EXPECT_EQ((ck::wrapper::size<0, 0>(layout_compiletime)), d4 * d3); + + EXPECT_EQ((ck::wrapper::get<0, 0, 0>(runtime_shape)), d4); +} diff --git a/test/wrapper/test_tensor.cpp b/test/wrapper/test_tensor.cpp new file mode 100644 index 0000000000..92f8e2e1bd --- /dev/null +++ b/test/wrapper/test_tensor.cpp @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/library/utility/device_memory.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +#include "ck/utility/common_header.hpp" + +#include "ck/wrapper/layout.hpp" +#include "ck/wrapper/tensor.hpp" + +// Compare data in tensor with offset from layout. +// Data and offset should match if physical memory has been initialized with +// sequentially increasing values from 0. +template +__host__ __device__ bool TestTensorCheck3d(TensorType& tensor) +{ + const auto& layout = ck::wrapper::layout(tensor); + for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++) + { + for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++) + { + for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++) + { + const auto idx = ck::make_tuple(ck::make_tuple(d, h), w); + if(tensor(idx) != layout(idx)) + { + return false; + } + } + } + } + return true; +} + +template +__host__ __device__ bool TestTensorCheck1d(TensorType& tensor, ck::index_t start_offset = 0) +{ + const auto& layout = ck::wrapper::layout(tensor); + for(ck::index_t w = 0; w < ck::wrapper::size<0>(layout); w++) + { + if(tensor(w) - start_offset != layout(ck::make_tuple(w))) + { + return false; + } + } + return true; +} + +template +__host__ __device__ bool StaticTestTensorCheck1d(TensorType& tensor) +{ + const auto& layout = ck::wrapper::layout(tensor); + bool success = true; + ck::static_for<0, nelems, 1>{}([&](auto w) { + if(tensor(ck::Number{}) != layout(ck::make_tuple(w.value))) + { + success = false; + } + }); + return success; +} + +template +__host__ __device__ void InitTensor(TensorType& tensor) +{ + for(ck::index_t i = 0; i < ck::wrapper::size(ck::wrapper::layout(tensor)); i++) + { + tensor(i) = i; + } +} + +template +__host__ __device__ void StaticInitTensor(TensorType& tensor) +{ + + ck::static_for<0, nelems, 1>{}([&](auto i) { tensor(ck::Number{}) = i.value; }); +} + +// Tests +TEST(TestTensor, ReadWriteHostMemory) +{ + constexpr ck::index_t nelems = 8; + + std::array data; + const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2)); + auto tensor = ck::wrapper::make_tensor(&data[0], layout); + InitTensor(tensor); + + EXPECT_TRUE(TestTensorCheck1d(tensor)); + EXPECT_TRUE(TestTensorCheck3d(tensor)); +} + +__global__ void TestTensorReadWriteDevice(void* data, void* success) +{ + constexpr ck::index_t nelems = 8; + constexpr ck::index_t scalar_per_vector = 1; + __shared__ ck::index_t p_shared[nelems]; + + ck::index_t* casted_data_ptr = static_cast(data); + bool* casted_success_ptr = static_cast(success); + + const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2)); + constexpr auto register_layout = ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{})); + + auto tensor_global = + ck::wrapper::make_tensor(casted_data_ptr, layout); + auto tensor_lds = ck::wrapper::make_tensor(p_shared, layout); + auto tensor_vgpr = ck::wrapper::make_register_tensor(register_layout); + auto tensor_sgpr = ck::wrapper::make_register_tensor(register_layout); + + InitTensor(tensor_global); + InitTensor(tensor_lds); + StaticInitTensor(tensor_vgpr); + StaticInitTensor(tensor_sgpr); + + *casted_success_ptr &= TestTensorCheck1d(tensor_global); + *casted_success_ptr &= TestTensorCheck3d(tensor_global); + + *casted_success_ptr &= TestTensorCheck1d(tensor_lds); + *casted_success_ptr &= TestTensorCheck3d(tensor_lds); + + *casted_success_ptr &= StaticTestTensorCheck1d(tensor_vgpr); + + *casted_success_ptr &= StaticTestTensorCheck1d(tensor_sgpr); +} + +TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory) +{ + constexpr ck::index_t nelems = 8; + std::array host_data; + + DeviceMem data_buf(nelems * sizeof(ck::index_t)); + data_buf.ToDevice(&host_data[0]); + DeviceMem success_buf(sizeof(bool)); + + launch_and_time_kernel(StreamConfig{}, + TestTensorReadWriteDevice, + dim3(1), + dim3(1), + nelems * sizeof(ck::index_t), + data_buf.GetDeviceBuffer(), + success_buf.GetDeviceBuffer()); + + bool success; + success_buf.FromDevice(&success); + EXPECT_TRUE(success); +} + +TEST(TestTensor, Slicing) +{ + constexpr ck::index_t nelems = 8; + + std::array data; + const auto shape = ck::make_tuple(ck::make_tuple(2, 2), 2); + const auto strides = ck::make_tuple(ck::make_tuple(1, 2), 4); + const auto layout = ck::wrapper::make_layout(shape, strides); + auto tensor = ck::wrapper::make_tensor(&data[0], layout); + InitTensor(tensor); + + auto tensor2x2x2 = + tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2)); + EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2); + EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2); + EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8); + EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2)); + + auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2)); + EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2); + EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2); + EXPECT_EQ(ck::wrapper::size(tensor2x2), 4); + EXPECT_TRUE(TestTensorCheck1d(tensor2x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0)))); + + auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2)); + EXPECT_EQ(rank(tensor1x1), 2); + EXPECT_EQ(depth(tensor1x1), 2); + EXPECT_EQ(size(tensor1x1), 1); + EXPECT_TRUE(TestTensorCheck1d(tensor1x1, layout(ck::make_tuple(ck::make_tuple(1, 1), 1)))); + + auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2)); + EXPECT_EQ(ck::wrapper::rank(tensor2), 1); + EXPECT_EQ(ck::wrapper::depth(tensor2), 1); + EXPECT_EQ(ck::wrapper::size(tensor2), 2); + EXPECT_TRUE(TestTensorCheck1d(tensor2, layout(ck::make_tuple(ck::make_tuple(1, 1), 0)))); + + // negative indexing + auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice()); + EXPECT_EQ(rank(tensor1x2), 2); + EXPECT_EQ(depth(tensor1x2), 2); + EXPECT_EQ(size(tensor1x2), 2); + EXPECT_TRUE(TestTensorCheck1d(tensor1x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0)))); +}