From 81ec5eff4a3cb64c6681043593862016193797d1 Mon Sep 17 00:00:00 2001 From: Haocong WANG Date: Wed, 20 Nov 2024 23:03:56 +0800 Subject: [PATCH 01/52] fix bug (#1680) --- .../device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp | 4 ++-- .../device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp | 4 ++-- .../device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp index b1b64ca853..9555dffd2f 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp @@ -41,7 +41,7 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances = std //################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| 
_NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Compute friendly DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 64, 16, 16, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, @@ -69,7 +69,7 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_instances = std: //################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| 
_NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Latency friendly DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2, 2, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, S<4, 4, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp index 658714d359..8666cf8589 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp @@ -40,7 +40,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple< //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | 
XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#ifdef __gfx94__ +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) //Only enable these instances on gfx94x // Compute friendly DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, @@ -67,7 +67,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_instances = std::tuple< //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | 
Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Latency friendly DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp index 382ed5b5a2..f5e801c167 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp @@ -40,7 +40,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple< //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | 
XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Compute friendly DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 16, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, @@ -68,7 +68,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple< //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| 
ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) // Latency friendly DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, From d31e8249c1be17aaada2a8e29df1c6495dc709f4 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 20 Nov 2024 14:01:04 -0800 Subject: [PATCH 02/52] Optimize docker file. 
(#1679) * reduce the docker image size and layers * clean up docker file * fix linker error for client example 24 * install CK into the default /opt/rocm/ path * restore installing CK to alternative path in CI * add linking for utility lib --- Dockerfile | 91 +++++++------------ .../24_grouped_conv_activation/CMakeLists.txt | 4 +- client_example/CMakeLists.txt | 2 +- 3 files changed, 35 insertions(+), 62 deletions(-) diff --git a/Dockerfile b/Dockerfile index 791d1d9f3a..b06726335a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,18 +4,14 @@ ARG ROCMVERSION=6.2 ARG compiler_version="" ARG compiler_commit="" ARG CK_SCCACHE="" - -RUN set -xe - ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ -RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins -# Add rocm repository -RUN chmod 1777 /tmp -RUN apt-get update -RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl - ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn -RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg + +# Add rocm repository +RUN set -xe && \ + useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins && \ + apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \ + curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg RUN if [ "$ROCMVERSION" != "6.3" ]; then \ sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate" && \ @@ -30,8 +26,8 @@ RUN if [ "$ROCMVERSION" != "6.3" ]; then \ amdgpu-repo --amdgpu-build=2074281; \ fi -RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" -RUN amdgpu-install -y --usecase=rocm --no-dkms +RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \ + amdgpu-install -y --usecase=rocm 
--no-dkms ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache @@ -76,66 +72,49 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- clang-format-12 \ kmod && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* && \ + rm -rf amdgpu-install* && \ +# Remove unnecessary rocm components that take a lot of space + apt-get remove -y rocblas rocfft rocsparse composablekernel-dev # hipTensor requires rocm-llvm-dev for rocm versions > 6.0.1 RUN if [ "$ROCMVERSION" = "6.1" ]; then \ sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev"; \ fi # Update the cmake to version 3.27.5 -RUN pip install --upgrade cmake==3.27.5 - +RUN pip install --upgrade cmake==3.27.5 && \ #Install latest ccache -RUN git clone https://github.com/ccache/ccache.git && \ - cd ccache && mkdir build && cd build && cmake .. && make install - + git clone https://github.com/ccache/ccache.git && \ + cd ccache && mkdir build && cd build && cmake .. && make install && \ #Install ninja build tracing tools -RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip -RUN gunzip /usr/local/bin/ninja.gz -RUN chmod a+x /usr/local/bin/ninja -RUN git clone https://github.com/nico/ninjatracing.git - + wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip && \ + gunzip /usr/local/bin/ninja.gz && \ + chmod a+x /usr/local/bin/ninja && \ + git clone https://github.com/nico/ninjatracing.git && \ #Install latest cppcheck -RUN git clone https://github.com/danmar/cppcheck.git && \ + git clone https://github.com/danmar/cppcheck.git && \ cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . 
WORKDIR / -# Setup ubsan environment to printstacktrace -RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer -ENV UBSAN_OPTIONS=print_stacktrace=1 - # Install an init system -RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb -RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb - -ARG PREFIX=/opt/rocm +RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \ + dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \ # Install packages for processing the performance results -RUN pip3 install --upgrade pip -RUN pip3 install sqlalchemy==1.4.46 -RUN pip3 install pymysql -RUN pip3 install pandas==2.0.3 -RUN pip3 install setuptools-rust -RUN pip3 install sshtunnel==0.4.0 -# Setup ubsan environment to printstacktrace -ENV UBSAN_OPTIONS=print_stacktrace=1 - -ENV LC_ALL=C.UTF-8 -ENV LANG=C.UTF-8 -RUN groupadd -f render - + pip3 install --upgrade pip && \ + pip3 install sqlalchemy==1.4.46 pymysql pandas==2.0.3 setuptools-rust sshtunnel==0.4.0 && \ +# Add render group + groupadd -f render && \ # Install the new rocm-cmake version -RUN git clone -b master https://github.com/ROCm/rocm-cmake.git && \ - cd rocm-cmake && mkdir build && cd build && \ - cmake .. && cmake --build . && cmake --build . --target install + git clone -b master https://github.com/ROCm/rocm-cmake.git && \ + cd rocm-cmake && mkdir build && cd build && \ + cmake .. && cmake --build . && cmake --build . 
--target install WORKDIR / - +# Add alternative compilers, if necessary ENV compiler_version=$compiler_version ENV compiler_commit=$compiler_commit -RUN sh -c "echo compiler version = '$compiler_version'" -RUN sh -c "echo compiler commit = '$compiler_commit'" - -ARG DISABLE_CACHE=0 +RUN sh -c "echo compiler version = '$compiler_version'" && \ + sh -c "echo compiler commit = '$compiler_commit'" RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ @@ -152,9 +131,3 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd make -j 8 ; \ else echo "using the release compiler"; \ fi - -#clean-up the deb package -RUN sh -c "rm -rf amdgpu-install*" - -#ENV HIP_CLANG_PATH='/llvm-project/build/bin' -#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'" diff --git a/client_example/24_grouped_conv_activation/CMakeLists.txt b/client_example/24_grouped_conv_activation/CMakeLists.txt index dc55250bfe..67bbdfec45 100644 --- a/client_example/24_grouped_conv_activation/CMakeLists.txt +++ b/client_example/24_grouped_conv_activation/CMakeLists.txt @@ -54,7 +54,7 @@ target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8 PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_reduction_operations - utility) + composable_kernel::utility) # Fwd convscale + AMAX add_executable(client_conv3d_fwd_convscale_amax_fp8 grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp) @@ -62,7 +62,7 @@ target_link_libraries(client_conv3d_fwd_convscale_amax_fp8 PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_reduction_operations - utility) + composable_kernel::utility) # Fwd convscale add_executable(client_conv3d_fwd_convscale_fp8 
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp) diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt index acb57d7bb0..c393972b42 100644 --- a/client_example/CMakeLists.txt +++ b/client_example/CMakeLists.txt @@ -62,7 +62,7 @@ else() set(CK_USE_WMMA "ON") endif() -find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations) +find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations utility) if(GPU_TARGETS MATCHES "gfx9") find_package(composable_kernel COMPONENTS device_contraction_operations) endif() From 6916d8cc033543d1ea2028215d75409e11813dd9 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Thu, 21 Nov 2024 14:49:13 +0800 Subject: [PATCH 03/52] Add QianFeng to code owners (#1682) --- .github/CODEOWNERS | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 459315e58b..5340be274b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,8 @@ -* @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk +* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk # Documentation files -docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk -*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk -*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk -.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk +docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +*.rst @ROCm/rocm-documentation @junliume @illsilin 
@carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk # Header directory for Doxygen documentation -library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk +library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk From fb1ccfa9df534c8c9f351dd959a0ff692d6f9210 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Thu, 21 Nov 2024 14:53:10 +0800 Subject: [PATCH 04/52] [CK_TILE] Add paged-kvcache support in group mode fmha fwd splitkv kernels (#1678) * Generate group mode paged-attn kernel * Enable paged-kvcache + group mode support * Add missing header: fused_moe.hpp * Add comment to explain kernel arg usage * Make error message more clear * Add comment for confusing data member names * Add more comment for confusing variable names * Fix typo in option description --- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 3 - example/ck_tile/01_fmha/fmha_fwd.cpp | 61 ++++++++++++------- example/ck_tile/01_fmha/fmha_fwd.hpp | 10 ++- example/ck_tile/01_fmha/utils.hpp | 4 +- .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 49 ++++++++++----- include/ck_tile/ops/fused_moe.hpp | 11 ++++ 6 files changed, 95 insertions(+), 43 deletions(-) create mode 100644 include/ck_tile/ops/fused_moe.hpp diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index b084e9d0fc..d1da951567 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -655,9 +655,6 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> if pipeline.F_spad != 't' or pipeline.F_skpad != 't': # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not continue - if 
pipeline.F_pagedkv == 't': - # we only use batch mode kernels to handle (paged-) kvcache problems - continue k = Kernel(F_idx=0, F_hdim=hdim, F_dtype=dtype, diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index 14291715fb..00e0a16536 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -62,7 +62,7 @@ auto create_args(int argc, char* argv[]) "-1 to choose s_knew in [1, s] randomly.") .insert("s_kpad", "-1", - "seqlen_k stride between 2 tokens, currently used in group-mode only\n" + "seqlen_k stride between 2 batches, currently used in group-mode only\n" "for kv-cache case, each batch [1,s,h,d]/[1,h,s,d] can have a stride\n" "along seqlen, instead of packed. same as xformer kv_padding") .insert("d", "128", "head dim for q, k") @@ -294,7 +294,8 @@ bool run(const ck_tile::ArgParser& arg_parser) #if !CK_TILE_FMHA_FWD_APPENDKV_API if(seqlen_knew != 0) { - std::cerr << "kvcache is not supported. ignoring the 's_knew' option" << std::endl; + std::cerr << "fmha_fwd_appendkv() is not enabled. ignoring the 's_knew' option" + << std::endl; seqlen_knew = 0; } #endif @@ -321,6 +322,13 @@ bool run(const ck_tile::ArgParser& arg_parser) rotary_dim = 0; } #endif + // to use fmha_fwd_appendkv(), make sure it's in batch mode + const bool need_append_kvcache = (0 < seqlen_knew || 0 < rotary_dim); + if(need_append_kvcache && mode == mode_enum::group) + { + std::cerr << "fmha_fwd_appendkv() will be invoked. ignoring the 'mode' option" << std::endl; + mode = mode_enum::batch; + } if(!(rotary_dim <= hdim_q)) { std::cerr << "rotary_dim should be less than or equal to head dim for q" << std::endl; @@ -356,22 +364,26 @@ bool run(const ck_tile::ArgParser& arg_parser) << std::endl; use_cache_batch_idx = false; } +#else + if(use_cache_batch_idx) + { + if(0 < page_block_size) + { + std::cerr << "paged-kvcache does not support cache_batch_idx. 
ignoring the " + "'cache_batch_idx' option" + << std::endl; + use_cache_batch_idx = false; + } + else if(mode == mode_enum::group) + { + std::cerr << "group mode will not use cache_batch_idx. ignoring the " + "'cache_batch_idx' option" + << std::endl; + use_cache_batch_idx = false; + } + } #endif - if(0 < page_block_size && use_cache_batch_idx) - { - std::cerr << "paged-kvcache does not support cache_batch_idx. ignoring the " - "'cache_batch_idx' option" - << std::endl; - use_cache_batch_idx = false; - } - // the input tensor layout for kvcache is same as batch mode - const bool need_append_kvcache = (0 < seqlen_knew || 0 < rotary_dim); const bool use_kvcache = (need_append_kvcache || use_cache_batch_idx || 0 < page_block_size); - if(use_kvcache && mode != mode_enum::batch) - { - std::cerr << "kvcache enabled. ignoring the 'mode' option" << std::endl; - mode = mode_enum::batch; - } auto [seqlen_qs, seqlen_ks, seqlen_kpads] = decode_seqlen(mode, @@ -380,7 +392,7 @@ bool run(const ck_tile::ArgParser& arg_parser) arg_parser.get_str("s_k"), arg_parser.get_str("s_kpad"), /*seqlen_k_min=*/0 < seqlen_knew ? seqlen_knew : 0, - use_kvcache); + need_append_kvcache); // compute kvcache seqlen_k (before appending knew/vnew) auto cache_seqlen_ks = seqlen_ks; std::transform(cache_seqlen_ks.begin(), @@ -741,8 +753,10 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t)); ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t)); - ck_tile::DeviceMem seqlen_k_buf( - use_kvcache || 0 <= seqlen_kpads[0] ? seqlen_ks.size() * sizeof(int32_t) : 0); + ck_tile::DeviceMem seqlen_k_buf((mode == mode_enum::batch && use_kvcache) || + 0 <= seqlen_kpads[0] + ? seqlen_ks.size() * sizeof(int32_t) + : 0); ck_tile::DeviceMem cache_seqlen_k_buf( need_append_kvcache ? 
cache_seqlen_ks.size() * sizeof(int32_t) : 0); ck_tile::DeviceMem rotary_cos_buf(rotary_cos_host.get_element_space_size_in_bytes()); @@ -763,7 +777,9 @@ bool run(const ck_tile::ArgParser& arg_parser) seqstart_q.ToDevice(seqstart_q_host.data()); seqstart_k.ToDevice(seqlen_kpads[0] < 0 ? seqstart_k_host.data() : seqstart_k_with_padding_host.data()); - seqlen_k_buf.ToDevice(use_kvcache || 0 <= seqlen_kpads[0] ? seqlen_ks.data() : nullptr); + seqlen_k_buf.ToDevice((mode == mode_enum::batch && use_kvcache) || 0 <= seqlen_kpads[0] + ? seqlen_ks.data() + : nullptr); cache_seqlen_k_buf.ToDevice(need_append_kvcache ? cache_seqlen_ks.data() : nullptr); rotary_cos_buf.ToDevice(rotary_cos_host.data()); rotary_sin_buf.ToDevice(rotary_sin_host.data()); @@ -976,8 +992,9 @@ bool run(const ck_tile::ArgParser& arg_parser) (mode == mode_enum::group ? seqstart_q.GetDeviceBuffer() : nullptr); args.seqstart_k_ptr = (mode == mode_enum::group ? seqstart_k.GetDeviceBuffer() : nullptr); - args.seqlen_k_ptr = - (use_kvcache || 0 <= k_paddings_[0] ? seqlen_k_buf.GetDeviceBuffer() : nullptr); + args.seqlen_k_ptr = ((mode == mode_enum::batch && use_kvcache) || 0 <= k_paddings_[0] + ? 
seqlen_k_buf.GetDeviceBuffer() + : nullptr); args.seqlen_k = shape_seqlen_k; // unused in group mode (or kvcache enabled) args.max_seqlen_q = max_seqlen_q; diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 251e61bc76..41edac67ba 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -173,8 +173,11 @@ struct fmha_fwd_splitkv_args // seqlen_k = kargs.seqlen_k // group mode: seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b] // seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b] - // kvcache mode (use same kernel as batch mode): + // batch mode (kvcache): // seqlen_q = kargs.seqlen_q + // seqlen_k = kargs.seqlen_k_ptr[b] + // group mode (kvcache): + // seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b] // seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b] const void* seqstart_q_ptr; const void* seqstart_k_ptr; @@ -251,7 +254,7 @@ struct fmha_fwd_appendkv_args ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr ck_tile::index_t page_block_size; // only used if 'block_table_ptr' is not nullptr - const void* cache_batch_idx; + const void* cache_batch_idx; // only used if block_table_ptr is nullptr -> batch mode (kvcache) ck_tile::index_t stride_q; ck_tile::index_t stride_k; @@ -389,6 +392,9 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args) args.nhead_q, args.nhead_q / args.nhead_k, args.num_splits, + args.block_table_ptr, + args.batch_stride_block_table, + args.page_block_size, args.scale_s, args.scale_p, args.stride_q, diff --git a/example/ck_tile/01_fmha/utils.hpp b/example/ck_tile/01_fmha/utils.hpp index 996032a717..faf3f08437 100644 --- a/example/ck_tile/01_fmha/utils.hpp +++ b/example/ck_tile/01_fmha/utils.hpp @@ -145,7 +145,7 @@ decode_seqlen(mode_enum mode, std::string k_val, std::string k_pad_val, ck_tile::index_t seqlen_k_min = 0, - bool use_kvcache = false, 
+ bool need_append_kvcache = false, std::optional seed = std::nullopt) { #define _S2I_(str_) static_cast(std::atoi((str_).c_str())) @@ -159,7 +159,7 @@ decode_seqlen(mode_enum mode, const ck_tile::index_t seqlen_k_max = (k < 0 ? q : k); std::vector seqlen_ks(batch, seqlen_k_max); - if(1 < batch && use_kvcache) + if(1 < batch && need_append_kvcache) { // to keep the original s_k value, we always use seqlen_k_max in first batch randints(std::next(seqlen_ks.begin()), diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index 4ffebc3c9c..98a4329d75 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -46,8 +46,7 @@ struct FmhaFwdSplitKVKernel static constexpr auto BiasEnum = FmhaPipeline::BiasEnum; static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant; static constexpr bool kIsPagedKV = FmhaPipeline::Problem::kIsPagedKV; - static_assert(!kIsGroupMode || (kIsGroupMode && !kIsPagedKV), - "paged-kvcache only supported by batch mode kernels"); + using FmhaMask = ck_tile::remove_cvref_t; static constexpr bool kHasMask = FmhaMask::IsMasking; @@ -198,8 +197,10 @@ struct FmhaFwdSplitKVKernel const int32_t* seqlen_k_ptr; ck_tile::index_t batch_stride_q; - ck_tile::index_t batch_stride_k; - ck_tile::index_t batch_stride_v; + ck_tile::index_t batch_stride_k; // when using paged-kvcache, this will be stride/size for + // single kcache page-block + ck_tile::index_t batch_stride_v; // when using paged-kvcache, this will be stride/size for + // single vcache page-block ck_tile::index_t batch_stride_lse_acc; ck_tile::index_t batch_stride_o_acc; }; @@ -212,14 +213,17 @@ struct FmhaFwdSplitKVKernel AlibiKargs, EmptyKargs<0>>>, std::conditional_t>, - std::conditional_t> + std::conditional_t>, + std::conditional_t> { const int32_t* seqstart_q_ptr; const int32_t* seqstart_k_ptr; const int32_t* 
seqlen_k_ptr; - ck_tile::index_t batch_stride_k; // only used for paged-kvcache - ck_tile::index_t batch_stride_v; // only used for paged-kvcache + ck_tile::index_t batch_stride_k; // only used for paged-kvcache, this will be stride/size + // for single kcache page-block + ck_tile::index_t batch_stride_v; // only used for paged-kvcache, this will be stride/size + // for single vcache page-block }; using Kargs = std::conditional_t; @@ -363,6 +367,9 @@ struct FmhaFwdSplitKVKernel ck_tile::index_t num_head_q, ck_tile::index_t nhead_ratio_qk, ck_tile::index_t num_splits, + const void* block_table_ptr, + ck_tile::index_t batch_stride_block_table, + ck_tile::index_t page_block_size, float scale_s, float scale_p, ck_tile::index_t stride_q, @@ -416,6 +423,7 @@ struct FmhaFwdSplitKVKernel {}, // placeholder for bias {}, // placeholder for mask {}, // placeholder for fp8_static_quant args + {}, // placeholder for paged-block table reinterpret_cast(seqstart_q_ptr), reinterpret_cast(seqstart_k_ptr), reinterpret_cast(seqlen_k_ptr), @@ -443,6 +451,12 @@ struct FmhaFwdSplitKVKernel { kargs.scale_p = scale_p; } + if constexpr(kIsPagedKV) + { + kargs.block_table_ptr = reinterpret_cast(block_table_ptr); + kargs.batch_stride_block_table = batch_stride_block_table; + kargs.page_block_size = page_block_size; + } return kargs; } @@ -489,15 +503,22 @@ struct FmhaFwdSplitKVKernel const long_index_t key_start = kargs.seqstart_k_ptr[i_batch]; batch_offset_q = query_start * kargs.stride_q; - batch_offset_k = key_start * kargs.stride_k; - - if constexpr(std::is_same_v) + if constexpr(kIsPagedKV) { - batch_offset_v = key_start * kargs.stride_v; + batch_offset_k = static_cast(i_batch) * kargs.batch_stride_k; + batch_offset_v = static_cast(i_batch) * kargs.batch_stride_v; } else { - batch_offset_v = key_start; + batch_offset_k = key_start * kargs.stride_k; + if constexpr(std::is_same_v) + { + batch_offset_v = key_start * kargs.stride_v; + } + else + { + batch_offset_v = key_start; + } } if 
constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) { @@ -685,7 +706,7 @@ struct FmhaFwdSplitKVKernel return make_page_block_navigator( kargs.k_ptr, - kargs.batch_stride_k, + kargs.batch_stride_k, // kcache page-block stride/size fixed_offset, block_indices, num_blocks, @@ -715,7 +736,7 @@ struct FmhaFwdSplitKVKernel return make_page_block_navigator( kargs.v_ptr, - kargs.batch_stride_v, + kargs.batch_stride_v, // vcache page-block stride/size fixed_offset, block_indices, num_blocks, diff --git a/include/ck_tile/ops/fused_moe.hpp b/include/ck_tile/ops/fused_moe.hpp new file mode 100644 index 0000000000..b74607f061 --- /dev/null +++ b/include/ck_tile/ops/fused_moe.hpp @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp" +#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp" +#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp" +#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" From d6d4c2788bc66c7ead56f1d7b03b7c7b28c2b007 Mon Sep 17 00:00:00 2001 From: Harisankar Sadasivan <135730918+hsadasiv@users.noreply.github.com> Date: Thu, 21 Nov 2024 08:21:37 -0800 Subject: [PATCH 05/52] universal streamk fp8 changes (#1665) * universal streamk fp8 changes & ckprofiler instances * revert strides to -1 and verification options * fp8 exclusion on pre-gfx94 for universal_streamk * PR review based revisions: permissions reverted, removed hip err checks --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- README.md | 3 +- example/01_gemm/CMakeLists.txt | 3 + example/01_gemm/common.hpp | 2 +- example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp | 13 +- example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp | 58 ++ 
.../01_gemm/run_gemm_example_streamk_v2.inc | 40 + .../device_gemm_xdl_cshuffle_streamk_v3.hpp | 382 ++++++-- .../gridwise_gemm_xdl_cshuffle_streamk_v3.hpp | 816 ++++++++++++++++-- .../gpu/gemm_universal_streamk.hpp | 315 +++++++ .../gpu/CMakeLists.txt | 6 + .../gpu/gemm_universal_streamk/CMakeLists.txt | 45 +- ..._universal_streamk_f16_f8_f16_mk_kn_mn.hpp | 84 ++ ..._f8_f16_mk_kn_mn_comp_default_instance.cpp | 24 + ...f8_f16_mk_kn_mn_comp_kpadding_instance.cpp | 24 + ..._f16_mk_kn_mn_comp_mnkpadding_instance.cpp | 24 + ...8_f16_mk_kn_mn_comp_mnpadding_instance.cpp | 24 + ...8_f16_mk_kn_mn_mem_v1_default_instance.cpp | 25 + ..._f16_mk_kn_mn_mem_v1_kpadding_instance.cpp | 25 + ...16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp | 25 + ...8_f16_mk_kn_mn_mem_v2_default_instance.cpp | 25 + ..._f16_mk_kn_mn_mem_v2_kpadding_instance.cpp | 25 + ...16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp | 25 + ..._universal_streamk_f16_f8_f16_mk_nk_mn.hpp | 90 ++ ..._f8_f16_mk_nk_mn_comp_default_instance.cpp | 24 + ...f8_f16_mk_nk_mn_comp_kpadding_instance.cpp | 24 + ..._f16_mk_nk_mn_comp_mnkpadding_instance.cpp | 24 + ...8_f16_mk_nk_mn_comp_mnpadding_instance.cpp | 24 + ...8_f16_mk_nk_mn_mem_v1_default_instance.cpp | 25 + ..._f16_mk_nk_mn_mem_v1_kpadding_instance.cpp | 25 + ...16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp | 25 + ...8_f16_mk_nk_mn_mem_v2_default_instance.cpp | 25 + ..._f16_mk_nk_mn_mem_v2_kpadding_instance.cpp | 25 + ...16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp | 25 + ..._universal_streamk_f8_f16_f16_mk_kn_mn.hpp | 85 ++ ...f16_f16_mk_kn_mn_comp_default_instance.cpp | 24 + ...16_f16_mk_kn_mn_comp_kpadding_instance.cpp | 24 + ..._f16_mk_kn_mn_comp_mnkpadding_instance.cpp | 24 + ...6_f16_mk_kn_mn_comp_mnpadding_instance.cpp | 24 + ...6_f16_mk_kn_mn_mem_v1_default_instance.cpp | 25 + ..._f16_mk_kn_mn_mem_v1_kpadding_instance.cpp | 25 + ...16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp | 25 + ...6_f16_mk_kn_mn_mem_v2_default_instance.cpp | 25 + 
..._f16_mk_kn_mn_mem_v2_kpadding_instance.cpp | 25 + ...16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp | 25 + ..._universal_streamk_f8_f16_f16_mk_nk_mn.hpp | 90 ++ ...f16_f16_mk_nk_mn_comp_default_instance.cpp | 24 + ...16_f16_mk_nk_mn_comp_kpadding_instance.cpp | 24 + ..._f16_mk_nk_mn_comp_mnkpadding_instance.cpp | 24 + ...6_f16_mk_nk_mn_comp_mnpadding_instance.cpp | 24 + ...6_f16_mk_nk_mn_mem_v1_default_instance.cpp | 25 + ..._f16_mk_nk_mn_mem_v1_kpadding_instance.cpp | 25 + ...16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp | 25 + ...6_f16_mk_nk_mn_mem_v2_default_instance.cpp | 25 + ..._f16_mk_nk_mn_mem_v2_kpadding_instance.cpp | 25 + ...16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp | 25 + .../gemm_universal_streamk/CMakeLists.txt | 26 - ...universal_streamk_f16_f16_f16_mk_kn_mn.hpp | 91 -- ...f16_f16_mk_kn_mn_comp_default_instance.cpp | 30 - ...16_f16_mk_kn_mn_comp_kpadding_instance.cpp | 30 - ..._f16_mk_kn_mn_comp_mnkpadding_instance.cpp | 30 - ...6_f16_mk_kn_mn_comp_mnpadding_instance.cpp | 30 - ...6_f16_mk_kn_mn_mem_v1_default_instance.cpp | 31 - ..._f16_mk_kn_mn_mem_v1_kpadding_instance.cpp | 31 - ...16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp | 31 - ...6_f16_mk_kn_mn_mem_v2_default_instance.cpp | 31 - ..._f16_mk_kn_mn_mem_v2_kpadding_instance.cpp | 31 - ...16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp | 31 - ...universal_streamk_f16_f16_f16_mk_nk_mn.hpp | 98 --- ...f16_f16_mk_nk_mn_comp_default_instance.cpp | 30 - ...16_f16_mk_nk_mn_comp_kpadding_instance.cpp | 30 - ..._f16_mk_nk_mn_comp_mnkpadding_instance.cpp | 30 - ...6_f16_mk_nk_mn_comp_mnpadding_instance.cpp | 30 - ...6_f16_mk_nk_mn_mem_v1_default_instance.cpp | 31 - ..._f16_mk_nk_mn_mem_v1_kpadding_instance.cpp | 31 - ...16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp | 31 - ...6_f16_mk_nk_mn_mem_v2_default_instance.cpp | 31 - ..._f16_mk_nk_mn_mem_v2_kpadding_instance.cpp | 31 - ...16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp | 31 - modified_files.txt | 10 + .../src/profile_gemm_universal_streamk.cpp | 24 +- 80 files 
changed, 2886 insertions(+), 991 deletions(-) create mode 100755 example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp mode change 100644 => 100755 example/01_gemm/run_gemm_example_streamk_v2.inc mode change 100644 => 100755 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp mode change 100644 => 100755 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp delete mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp delete mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp create mode 100755 modified_files.txt mode change 100644 => 100755 profiler/src/profile_gemm_universal_streamk.cpp diff --git a/README.md b/README.md index 302173dc17..d8eb152ee9 100644 --- a/README.md +++ b/README.md @@ -154,8 +154,7 @@ Additional cmake flags can be used to significantly speed-up the build: other platforms have faster instances, such as 
`xdl` or `wmma`, available. * `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances, - such as `gemm_universal` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not - have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on + such as `gemm_universal`, `gemm_universal_streamk` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on architectures like the MI100/MI200 for the functional support only. ## Using sccache for building diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 52c8ab5806..957acce165 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -77,6 +77,9 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8) add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8) +add_example_executable(example_gemm_xdl_fp8_streamk_v3 gemm_xdl_fp8_streamk_v3.cpp) +add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_streamk_v3) + add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp index 6e1c9f2a0d..67bf92bbbc 100644 --- a/example/01_gemm/common.hpp +++ b/example/01_gemm/common.hpp @@ -44,7 +44,7 @@ struct ProblemSizeStreamK final ck::index_t StrideB = -1; ck::index_t StrideC = -1; - ck::index_t NumSKBlocks = -1; + ck::index_t NumSKBlocks = -1; // number of stream-k blocks }; struct ProblemSizeStreamK_universal final { diff --git a/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp b/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp index 5b163962b9..36ac51f1da 100644 --- a/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp +++ 
b/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp @@ -8,7 +8,7 @@ using ADataType = ck::half_t; using BDataType = ck::half_t; using AccDataType = float; -using CShuffleDataType = ck::half_t; +using CShuffleDataType = float; using CDataType = ck::half_t; using ALayout = Row; @@ -43,6 +43,17 @@ using DeviceGemmV2_Streamk_Instance = using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example_streamk_v2.inc" int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp b/example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp new file mode 100755 index 0000000000..3b79ae9b85 --- /dev/null +++ b/example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +using ADataType = ck::f8_t; +using BDataType = ck::f8_t; +using AccDataType = float; +using CShuffleDataType = ck::half_t; +using CDataType = ck::half_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmV2_Streamk_Instance = + ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3< + ALayout, BLayout, CLayout, + ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, + PassThrough, PassThrough, PassThrough, GemmDefault, + 256, + 128, 256, + 128, 16, 16, + 16, 16, + 4, 8, + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 16, 16, 1, + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 16, 16, 1, + 1, 2, S<1, 32, 1, 8>, 8, + 
ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ck::f8_t>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + +#include "run_gemm_example_streamk_v2.inc" + +int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); } diff --git a/example/01_gemm/run_gemm_example_streamk_v2.inc b/example/01_gemm/run_gemm_example_streamk_v2.inc old mode 100644 new mode 100755 index 8ed8b81bec..04243b8291 --- a/example/01_gemm/run_gemm_example_streamk_v2.inc +++ b/example/01_gemm/run_gemm_example_streamk_v2.inc @@ -176,6 +176,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_ref_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; @@ -196,6 +197,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) * + c_m_n_device_ref_result.mDesc.GetElementSpaceSize()); a_m_k_device_buf.ToDevice(a_m_k.mData.data()); b_k_n_device_buf.ToDevice(b_k_n.mData.data()); @@ -240,6 +243,13 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) return true; } + std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument); + if(workspace_size != 0) + { + workspace.Realloc(workspace_size); + gemm.SetWorkSpacePointer(&argument, 
workspace.GetDeviceBuffer()); + } + bool pass = true; if((config.do_verification == 1) || (config.do_verification == 3)) { @@ -271,6 +281,36 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) #endif } + if((config.do_verification == 2) || (config.do_verification == 3)) + { + // GPU verification + auto ref_gemm_gpu = ReferenceGemmInstanceGPU{}; + auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker(); + + auto ref_argument_gpu = ref_gemm_gpu.MakeArgument( + static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_ref_buf.GetDeviceBuffer()), + M, + N, + K, + a_element_op, + b_element_op, + c_element_op); + + std::cout << "Running verification on GPU." << std::endl; + ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{}); + + c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data()); + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + pass &= ck::utils::check_err(c_m_n_device_result, + c_m_n_device_ref_result, + "Error: Incorrect results!", + get_rtol(), + get_atol()); + } + if(config.time_kernel) { ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp old mode 100644 new mode 100755 index 452063156e..cfd9a12047 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp @@ -131,6 +131,7 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2 0) { arg.Print(); @@ -147,26 +148,27 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2(arg.p_workspace_) + + arg.block_2_ctile_map_streamk.get_workspace_size_for_acc( + sizeof(GemmAccDataType)); + auto preprocess = [&]() { + hipMemsetAsync( + 
workspace_semaphore, + 0, + // sizeof(uint32_t), + arg.block_2_ctile_map_streamk.get_workspace_size_for_semaphore(), + stream_config.stream_id_); + }; + + ave_time = launch_and_time_kernel_with_preprocess( + stream_config, preprocess, kernel, grid_dim, dim3(BlockSize), 0, arg); + } } }; @@ -211,14 +236,12 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2; - Run(kernel); - } + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + + Run(kernel); } // Tail number could be One to Seven else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) @@ -340,53 +363,49 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2; - Run(kernel); - } - else - { - const auto kernel = - kernel_gemm_xdl_cshuffle_v3_2lds; - Run(kernel); - } + const auto kernel = + kernel_gemm_xdl_cshuffle_v3_2lds; + Run(kernel); + } + else + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3_2lds; + Run(kernel); } } else { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) - { - const auto kernel = - kernel_gemm_xdl_cshuffle_v3; - Run(kernel); - } - else - { - const auto kernel = - kernel_gemm_xdl_cshuffle_v3; - Run(kernel); - } + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + else + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); } } } @@ -396,14 +415,11 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2; - Run(kernel); - } + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + Run(kernel); } } @@ -418,6 +434,29 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2(pArg); + if constexpr(GridwiseGemm::Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + return p_arg->block_2_ctile_map_streamk.get_workspace_size(sizeof(GemmAccDataType)); + } + else + { + return 0; + } + } + + void SetWorkSpacePointer(BaseArgument* pArg, + 
void* p_workspace, + const StreamConfig& = StreamConfig{}) const override + { + Argument* pArg_ = dynamic_cast(pArg); + + pArg_->p_workspace_ = p_workspace; + } + static constexpr bool IsValidCompilationParameter() { // TODO: properly implement this check @@ -464,8 +503,205 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2; + calculate_grid_size(kernel); + } + // Tail number could be One to Seven + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) + { + + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Full) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Three) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Four) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Five) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six) + { + const auto kernel = 
+ kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Seven) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + } + // Tail number could be Odd or Even + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4) + { + + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3_2lds; + calculate_grid_size(kernel); + } + else + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3_2lds; + calculate_grid_size(kernel); + } + } + else + { + + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + else + { + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + } + else + { + // Tail number always 1 + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1) + { + + const auto kernel = kernel_gemm_xdl_cshuffle_v3; + calculate_grid_size(kernel); + } + } + + return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, streamk_sel, Grid_size}; } static auto MakeInvoker() { return Invoker{}; } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp old mode 100644 new mode 100755 index ff10215353..6ef35da485 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp @@ -14,6 +14,8 @@ #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1r2.hpp" #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" #include 
"ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/workgroup_barrier.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" namespace ck { @@ -38,7 +40,7 @@ __global__ void __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; GridwiseGemm::template Run( - karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg); + karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg, karg.p_workspace_); #else ignore = karg; #endif // end of if (defined(__gfx9__)) @@ -62,7 +64,13 @@ __global__ void __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()]; GridwiseGemm::template Run_2Lds( - karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared_0, p_shared_1, karg); + karg.p_a_grid, + karg.p_b_grid, + karg.p_c_grid, + p_shared_0, + p_shared_1, + karg, + karg.p_workspace_); #else ignore = karg; #endif // end of if (defined(__gfx9__)) @@ -521,7 +529,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_, Streamk_sel_, Grid_size_}, p_a_grid{p_a_grid_}, p_b_grid{p_b_grid_}, - p_c_grid{p_c_grid_} + p_c_grid{p_c_grid_}, + block_2_ctile_map_streamk( + M_, N_, AK0Number * CalculateKPadded(K_, 1), Grid_size_, Streamk_sel_) { } @@ -529,6 +539,13 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 const ADataType* p_a_grid; const BDataType* p_b_grid; CDataType* p_c_grid; + BlockToCTileMap_GemmStreamK_v2 + block_2_ctile_map_streamk; }; struct SplitKBatchOffset @@ -853,6 +870,19 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; } + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MShuffle_MPerShuffle_NShuffle_NPerShuffle() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + return make_naive_tensor_descriptor_packed( + make_tuple(Number{}, + Number{}, + Number{}, + Number{})); + } + using BlockwiseGemmPipe = 
remove_cvref_t(); + constexpr auto NPerBlockReduction = + NPerBlockPow2 / CShuffleBlockTransferScalarPerVector_NPerBlock; + constexpr auto MPerBlockReduction = + (BlockSize + NPerBlockReduction - 1) / NPerBlockReduction; + return Sequence{}; + } + + __host__ __device__ static constexpr auto GetPartialAccBlockDescriptor() + { + const auto c_partial_acc_block_m_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MPerBlock, NPerBlock), + make_tuple(NPerBlock, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MPerBlock, NPerBlock), + make_tuple(I1, MPerBlock)); + } + }(); + return c_partial_acc_block_m_n; + } using Block2CTileMap_streamk = BlockToCTileMap_GemmStreamK_v2( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); Block2CTileMap_streamk block_2_ctile_map_streamk(problem.M, problem.N, AK0Number * problem.KPadded, problem.Grid_size, problem.Streamk_sel); uint32_t iter_start, iter_end; - bool is_sk_block, is_dp_block; + bool is_sk_block, is_dp_block, is_reduction_block; index_t num_k_block_main_loop; + const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( + problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC); + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n, problem.MBlock, problem.NBlock); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + uint32_t* p_semaphore = reinterpret_cast( + reinterpret_cast(p_workspace) + + block_2_ctile_map_streamk.get_workspace_size_for_acc(sizeof(AccDataType))); for(auto block_idx = get_block_1d_id(); block_idx < block_2_ctile_map_streamk.get_grid_dims(); block_idx += gridDim.x) @@ -1163,6 +1241,214 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 
block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end); num_k_block_main_loop = iter_end - iter_start; + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + is_reduction_block = static_cast(block_idx) >= + block_2_ctile_map_streamk.reduction_start_block_idx; + if(is_reduction_block) + { + // descriptors + constexpr auto cluster_length_reduce = GetClusterLengthReduction(); + constexpr auto reduce_desc = make_cluster_descriptor(cluster_length_reduce); + const auto reduce_thread_cluster_idx = + reduce_desc.CalculateBottomIndex(make_multi_index(block_idx)); + const auto thread_m_cluster_id = reduce_thread_cluster_idx[I0]; + const auto thread_n_cluster_id = reduce_thread_cluster_idx[I1]; + + constexpr auto MReduceIters = math::integer_divide_ceil( + Number{}, cluster_length_reduce.At(I0)); + constexpr auto NReduceIters = math::integer_divide_ceil( + Number{}, + cluster_length_reduce.At(I1) * + Number{}); + + constexpr auto acc_thread_buf_load_desc = make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{})); + constexpr auto acc_thread_buf_store_desc = + make_naive_tensor_descriptor_packed(make_tuple( + I1, I1, I1, Number{})); + + constexpr auto c_partial_acc_block_m_n = GetPartialAccBlockDescriptor(); + + constexpr auto partial_acc_load_step_n = + make_multi_index(0, + cluster_length_reduce.At(I1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_load_step_n_reverse = make_multi_index( + 0, + -1 * cluster_length_reduce.At(I1).value * (NReduceIters - 1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_load_step_m = + make_multi_index(cluster_length_reduce.At(I0), 0); + + constexpr auto partial_acc_store_step_n = + make_multi_index(0, + 0, + 0, + cluster_length_reduce.At(I1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_store_step_n_reverse = make_multi_index( + 0, + 0, + 0, + -1 * 
cluster_length_reduce.At(I1).value * (NReduceIters - 1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_store_step_m = + make_multi_index(0, cluster_length_reduce.At(I0), 0, 0); + + StaticBuffer + parcial_acc_buf; + StaticBuffer + acc_buf; + + // start to compute + auto reduction_idx = + block_idx - block_2_ctile_map_streamk.reduction_start_block_idx; + auto spatial_idx = block_2_ctile_map_streamk.tile_to_spatial( + reduction_idx, problem.M, problem.N); + + workgroup_barrier wg_barrier(p_semaphore); + + uint32_t tile_acc_offset_start = + block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx); + uint32_t tile_acc_offset_end = + block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx + + 1); + __syncthreads(); + + auto acc_load = ThreadwiseTensorSliceTransfer_v2< + AccDataType, // SrcData, + AccDataType, // DstData, + decltype(c_partial_acc_block_m_n), // SrcDesc, + decltype(acc_thread_buf_load_desc), // DstDesc, + Sequence<1, + CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths, + Sequence<0, 1>, // DimAccessOrder, + 1, // SrcVectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // SrcScalarPerVector, + 1, // SrcScalarStrideInVector, + false // SrcResetCoordinateAfterRun, + >{c_partial_acc_block_m_n, + make_multi_index(thread_m_cluster_id, + thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock)}; + + auto acc_store = ThreadwiseTensorSliceTransfer_v1r3< + AccDataType, // SrcData, + CDataType, // DstData, + decltype(acc_thread_buf_store_desc), // SrcDesc, + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), // DstDesc, + CElementwiseOperation, // ElementwiseOperation, + Sequence<1, + 1, + 1, + CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths, + Sequence<0, 1, 2, 3>, // DimAccessOrder, + 3, // DstVectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // DstScalarPerVector, + InMemoryDataOperationEnum::Set, // InMemoryDataOperationEnum 
DstInMemOp, + 1, // DstScalarStrideInVector, + false // DstResetCoordinateAfterRun, + >{c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(__builtin_amdgcn_readfirstlane(spatial_idx[I0]), + thread_m_cluster_id, + __builtin_amdgcn_readfirstlane(spatial_idx[I1]), + thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock), + CElementwiseOperation{}}; + + wg_barrier.wait_eq(reduction_idx, tile_acc_offset_end - tile_acc_offset_start); + + if(threadIdx.x == 0) + { + p_semaphore[reduction_idx] = 0; + } + using Accumulation = ck::detail:: + AccumulateWithNanCheck; + + for(int i_m = 0; i_m < MReduceIters; i_m++) + { + static_for<0, NReduceIters, 1>{}([&](auto i_n_reduce) { + acc_buf.Clear(); + for(auto i = tile_acc_offset_start; i < tile_acc_offset_end; i++) + { + auto c_partial_acc_buf = + make_dynamic_buffer( + reinterpret_cast(p_workspace) + + i * c_partial_acc_block_m_n.GetElementSpaceSize(), + c_partial_acc_block_m_n.GetElementSpaceSize()); + + acc_load.Run(c_partial_acc_block_m_n, + c_partial_acc_buf, + acc_thread_buf_load_desc, + make_tuple(I0, I0), + parcial_acc_buf); + + static_for<0, CShuffleBlockTransferScalarPerVector_NPerBlock, 1>{}( + [&](auto i_vec) { + constexpr auto offset = + acc_thread_buf_load_desc.CalculateOffset( + make_tuple(0, i_vec)); + Accumulation::Calculate(acc_buf(Number{}), + parcial_acc_buf[Number{}]); + }); + } + + if(thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock < + NPerBlock) + { + acc_store.Run(acc_thread_buf_store_desc, + make_tuple(I0, I0, I0, I0), + acc_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + } + if constexpr(NReduceIters != 1) + { + if constexpr(i_n_reduce != (NReduceIters - 1)) + { + acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + partial_acc_load_step_n); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_n); + } + else + { + acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + 
partial_acc_load_step_n_reverse); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_n_reverse); + } + } + }); + { + acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + partial_acc_load_step_m); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_m); + } + } + + continue; + } + } + + // offset for last acc buffer of this block + uint32_t block_acc_offset = + (block_2_ctile_map_streamk.get_acc_buffer_offset_from_block(block_idx + 1) - 1) * + MPerBlock * NPerBlock; while(true) { uint32_t current_iter_length = __builtin_amdgcn_readfirstlane( @@ -1173,33 +1459,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 iter_end - 1, tile_idx, iter_offset); iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1); - const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(problem.M, - problem.MPadded, - problem.K, - problem.KPadded, - problem.StrideA, - problem.AK0); - const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(problem.K, - problem.KPadded, - problem.N, - problem.NPadded, - problem.StrideB, - problem.BK0); - const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( - problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC); - - const auto c_grid_desc_mblock_mperblock_nblock_nperblock = - MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - c_grid_desc_m_n, problem.MBlock, problem.NBlock); - auto c_grid_buf = make_dynamic_buffer( - p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); - - const auto a_grid_buf = make_dynamic_buffer( - p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); - - const auto b_grid_buf = make_dynamic_buffer( - p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); - auto block_work_idx = block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N); @@ -1363,11 +1622,20 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 constexpr auto 
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + constexpr auto c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle = + GetCBlockDescriptor_MShuffle_MPerShuffle_NShuffle_NPerShuffle(); + auto c_shuffle_block_buf = make_dynamic_buffer( static_cast(p_shared), c_shuffle_block_desc_mblock_mperblock_nblock_nperblock .GetElementSpaceSize()); + auto c_partial_acc_buf = + make_dynamic_buffer( + reinterpret_cast(p_workspace) + block_acc_offset, + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle + .GetElementSpaceSize()); + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, @@ -1477,7 +1745,34 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 c_grid_desc_mblock_mperblock_nblock_nperblock, make_multi_index(block_m_id, 0, block_n_id, 0), c_element_op}; - + // LDS to global partial acc + auto c_block_copy_lds_to_partial_acc = ThreadGroupTensorSliceTransfer_v6r1r2< + ThisThreadBlock, // index_t BlockSize, + CElementwiseOperation, // ElementwiseOperation, + // InMemoryDataOperationEnum::Set, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * + NPerXdl>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + CShuffleDataType, // typename SrcData, + CShuffleDataType, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + false, // bool ThreadTransferSrcResetCoordinateAfterRun, => need to be + // false, othre wise has scratch + false> // bool ThreadTransferDstResetCoordinateAfterRun, 
=> need to be + // false, othre wise has scratch + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + make_multi_index(0, 0, 0, 0), + c_element_op}; // space filling curve for threadwise C in VGPR constexpr auto sfc_c_vgpr = SpaceFillingCurve, @@ -1535,15 +1830,40 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 } else if(is_sk_block) { - // each block copy its data from LDS to global - c_shuffle_block_copy_lds_to_global - .template Run( + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Atomic) + { + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global + .template Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + } + else if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + // constexpr offset + c_block_copy_lds_to_partial_acc.SetSrcSliceOrigin( c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, - c_shuffle_block_buf, - c_grid_desc_mblock_mperblock_nblock_nperblock, - c_grid_buf); + make_tuple(0, 0, 0, 0)); + + c_block_copy_lds_to_partial_acc.SetDstSliceOrigin( + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + make_tuple(MXdlPerWave, 0, NXdlPerWave, 0)); + + c_block_copy_lds_to_partial_acc + .template Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + c_partial_acc_buf); + } } if constexpr(access_id < num_access - 1) @@ -1555,15 +1875,33 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); } }); - } + + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + if(is_sk_block) + { + // increase the counter for this tile + workgroup_barrier 
wg_barrier(p_semaphore); + wg_barrier.inc(tile_idx); + } + } + } // shuffle c and write-out end + // exit condition iter_end -= current_iter_length; if(iter_end <= iter_start) break; + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + block_acc_offset -= MPerBlock * NPerBlock; + } // make sure next loop LDS is ready for use block_sync_lds(); - } - } + } // while loop + + } // for loop } template ( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + uint32_t iter_start, iter_end; - bool is_sk_block, is_dp_block; //, is_padding_block; //, is_reduction_block; + bool is_sk_block, is_dp_block, is_reduction_block; index_t num_k_block_main_loop; + const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( + problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC); + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n, problem.MBlock, problem.NBlock); + + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + Block2CTileMap_streamk block_2_ctile_map_streamk(problem.M, + problem.N, + AK0Number * problem.KPadded, + problem.Grid_size, + problem.Streamk_sel); for(auto block_idx = get_block_1d_id(); block_idx < block_2_ctile_map_streamk.get_grid_dims(); block_idx += gridDim.x) @@ -1601,6 +1963,235 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end); num_k_block_main_loop = iter_end - iter_start; + uint32_t* p_semaphore = reinterpret_cast( + reinterpret_cast(p_workspace) + + block_2_ctile_map_streamk.get_workspace_size_for_acc(sizeof(AccDataType))); + + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + is_reduction_block = 
static_cast(block_idx) >= + block_2_ctile_map_streamk.reduction_start_block_idx; + if(is_reduction_block) + { + // descriptors + constexpr auto cluster_length_reduce = GetClusterLengthReduction(); + constexpr auto reduce_desc = make_cluster_descriptor(cluster_length_reduce); + const auto reduce_thread_cluster_idx = + reduce_desc.CalculateBottomIndex(make_multi_index(block_idx)); + const auto thread_m_cluster_id = reduce_thread_cluster_idx[I0]; + const auto thread_n_cluster_id = reduce_thread_cluster_idx[I1]; + + constexpr auto MReduceIters = math::integer_divide_ceil( + Number{}, cluster_length_reduce.At(I0)); + constexpr auto NReduceIters = math::integer_divide_ceil( + Number{}, + cluster_length_reduce.At(I1) * + Number{}); + + constexpr auto acc_thread_buf_load_desc = make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{})); + constexpr auto acc_thread_buf_store_desc = + make_naive_tensor_descriptor_packed(make_tuple( + I1, I1, I1, Number{})); + + constexpr auto c_partial_acc_block_m_n = GetPartialAccBlockDescriptor(); + + constexpr auto partial_acc_load_step_n = + make_multi_index(0, + cluster_length_reduce.At(I1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_load_step_n_reverse = make_multi_index( + 0, + -1 * cluster_length_reduce.At(I1).value * (NReduceIters - 1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_load_step_m = + make_multi_index(cluster_length_reduce.At(I0), 0); + + constexpr auto partial_acc_store_step_n = + make_multi_index(0, + 0, + 0, + cluster_length_reduce.At(I1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_store_step_n_reverse = make_multi_index( + 0, + 0, + 0, + -1 * cluster_length_reduce.At(I1).value * (NReduceIters - 1) * + CShuffleBlockTransferScalarPerVector_NPerBlock); + constexpr auto partial_acc_store_step_m = + make_multi_index(0, cluster_length_reduce.At(I0), 0, 0); + + StaticBuffer + parcial_acc_buf; + StaticBuffer 
+ acc_buf; + + // start to compute + auto reduction_idx = + block_idx - block_2_ctile_map_streamk.reduction_start_block_idx; + auto spatial_idx = block_2_ctile_map_streamk.tile_to_spatial( + reduction_idx, problem.M, problem.N); + + workgroup_barrier wg_barrier(p_semaphore); + + uint32_t tile_acc_offset_start = + block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx); + uint32_t tile_acc_offset_end = + block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx + + 1); + + uint32_t expected_count = tile_acc_offset_end - tile_acc_offset_start; + + if(threadIdx.x == 0) + { + p_semaphore[reduction_idx] = 0; + } + + __syncthreads(); + + auto acc_load = ThreadwiseTensorSliceTransfer_v2< + AccDataType, // SrcData, + AccDataType, // DstData, + decltype(c_partial_acc_block_m_n), // SrcDesc, + decltype(acc_thread_buf_load_desc), // DstDesc, + Sequence<1, + CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths, + Sequence<0, 1>, // DimAccessOrder, + 1, // SrcVectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // SrcScalarPerVector, + 1, // SrcScalarStrideInVector, + false // SrcResetCoordinateAfterRun, + >{c_partial_acc_block_m_n, + make_multi_index(thread_m_cluster_id, + thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock)}; + + auto acc_store = ThreadwiseTensorSliceTransfer_v1r3< + AccDataType, // SrcData, + CDataType, // DstData, + decltype(acc_thread_buf_store_desc), // SrcDesc, + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), // DstDesc, + CElementwiseOperation, // ElementwiseOperation, + Sequence<1, + 1, + 1, + CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths, + Sequence<0, 1, 2, 3>, // DimAccessOrder, + 3, // DstVectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // DstScalarPerVector, + InMemoryDataOperationEnum::Set, // InMemoryDataOperationEnum DstInMemOp, + 1, // DstScalarStrideInVector, + false // DstResetCoordinateAfterRun, + 
>{c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(__builtin_amdgcn_readfirstlane(spatial_idx[I0]), + thread_m_cluster_id, + __builtin_amdgcn_readfirstlane(spatial_idx[I1]), + thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock), + CElementwiseOperation{}}; + +#if 0 + if(threadIdx.x == 0) { + printf("bid:%d, rid:%d, os:%d,%d, spatial:%d,%d\n", static_cast(blockIdx.x), + reduction_idx, __builtin_amdgcn_readfirstlane(tile_acc_offset_start), __builtin_amdgcn_readfirstlane(tile_acc_offset_end), + __builtin_amdgcn_readfirstlane(spatial_idx[I0]), + __builtin_amdgcn_readfirstlane(spatial_idx[I1])); + } +#endif + if(threadIdx.x == 0) + { + atomicAdd(&p_semaphore[reduction_idx], 1); + } + + wg_barrier.wait_eq(p_semaphore[reduction_idx], expected_count); + using Accumulation = ck::detail:: + AccumulateWithNanCheck; + + for(int i_m = 0; i_m < MReduceIters; i_m++) + { + static_for<0, NReduceIters, 1>{}([&](auto i_n_reduce) { + acc_buf.Clear(); + for(auto i = tile_acc_offset_start; i < tile_acc_offset_end; i++) + { + auto c_partial_acc_buf = + make_dynamic_buffer( + reinterpret_cast(p_workspace) + + i * c_partial_acc_block_m_n.GetElementSpaceSize(), + c_partial_acc_block_m_n.GetElementSpaceSize()); + + acc_load.Run(c_partial_acc_block_m_n, + c_partial_acc_buf, + acc_thread_buf_load_desc, + make_tuple(I0, I0), + parcial_acc_buf); + + static_for<0, CShuffleBlockTransferScalarPerVector_NPerBlock, 1>{}( + [&](auto i_vec) { + constexpr auto offset = + acc_thread_buf_load_desc.CalculateOffset( + make_tuple(0, i_vec)); + Accumulation::Calculate(acc_buf(Number{}), + parcial_acc_buf[Number{}]); + }); + } + + if(thread_n_cluster_id * + CShuffleBlockTransferScalarPerVector_NPerBlock < + NPerBlock) + { + acc_store.Run(acc_thread_buf_store_desc, + make_tuple(I0, I0, I0, I0), + acc_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + } + if constexpr(NReduceIters != 1) + { + if constexpr(i_n_reduce != (NReduceIters - 1)) + { + 
acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + partial_acc_load_step_n); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_n); + } + else + { + acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + partial_acc_load_step_n_reverse); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_n_reverse); + } + } + }); + { + acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n, + partial_acc_load_step_m); + acc_store.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + partial_acc_store_step_m); + } + } + + continue; + } + } + + // offset for last acc buffer of this block + uint32_t block_acc_offset = + (block_2_ctile_map_streamk.get_acc_buffer_offset_from_block(block_idx + 1) - 1) * + MPerBlock * NPerBlock; + while(true) { uint32_t current_iter_length = __builtin_amdgcn_readfirstlane( @@ -1611,33 +2202,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 iter_end - 1, tile_idx, iter_offset); iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1); - const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(problem.M, - problem.MPadded, - problem.K, - problem.KPadded, - problem.StrideA, - problem.AK0); - const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(problem.K, - problem.KPadded, - problem.N, - problem.NPadded, - problem.StrideB, - problem.BK0); - const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( - problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC); - - const auto c_grid_desc_mblock_mperblock_nblock_nperblock = - MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - c_grid_desc_m_n, problem.MBlock, problem.NBlock); - - auto c_grid_buf = make_dynamic_buffer( - p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); - - const auto a_grid_buf = make_dynamic_buffer( - p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); - const auto b_grid_buf 
= make_dynamic_buffer( - p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); - auto block_work_idx = block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N); @@ -1811,11 +2375,20 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + constexpr auto c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle = + GetCBlockDescriptor_MShuffle_MPerShuffle_NShuffle_NPerShuffle(); + auto c_shuffle_block_buf = make_dynamic_buffer( static_cast(p_shared_0), c_shuffle_block_desc_mblock_mperblock_nblock_nperblock .GetElementSpaceSize()); + auto c_partial_acc_buf = + make_dynamic_buffer( + reinterpret_cast(p_workspace) + block_acc_offset, + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle + .GetElementSpaceSize()); + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, @@ -1925,6 +2498,35 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 make_multi_index(block_m_id, 0, block_n_id, 0), c_element_op}; + // LDS to global partial acc + auto c_block_copy_lds_to_partial_acc = ThreadGroupTensorSliceTransfer_v6r1r2< + ThisThreadBlock, // index_t BlockSize, + CElementwiseOperation, // ElementwiseOperation, + // InMemoryDataOperationEnum::Set, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * + NPerXdl>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + CShuffleDataType, // typename SrcData, + CShuffleDataType, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + 
CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + false, // bool ThreadTransferSrcResetCoordinateAfterRun, => need to be + // false, othre wise has scratch + false> // bool ThreadTransferDstResetCoordinateAfterRun, => need to be + // false, othre wise has scratch + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + make_multi_index(0, 0, 0, 0), + c_element_op}; + // space filling curve for threadwise C in VGPR constexpr auto sfc_c_vgpr = SpaceFillingCurve, @@ -1982,15 +2584,40 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 } else if(is_sk_block) { - // each block copy its data from LDS to global - c_shuffle_block_copy_lds_to_global - .template Run( + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Atomic) + { + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global + .template Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + } + else if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + // constexpr offset + c_block_copy_lds_to_partial_acc.SetSrcSliceOrigin( c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, - c_shuffle_block_buf, - c_grid_desc_mblock_mperblock_nblock_nperblock, - c_grid_buf); + make_tuple(0, 0, 0, 0)); + + c_block_copy_lds_to_partial_acc.SetDstSliceOrigin( + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + make_tuple(MXdlPerWave, 0, NXdlPerWave, 0)); + + c_block_copy_lds_to_partial_acc + .template Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle, + c_partial_acc_buf); + } } if constexpr(access_id < num_access - 1) { @@ -2002,6 +2629,27 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 } }); } + // exit 
condition + iter_end -= current_iter_length; + if(iter_end <= iter_start) + break; + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + block_acc_offset -= MPerBlock * NPerBlock; + } + // make sure next loop LDS is ready for use + block_sync_lds(); + } + if constexpr(Block2CTileMap_streamk::ReductionStrategy == + StreamKReductionStrategy::Reduction) + { + if(is_sk_block) + { + // increase the counter for this tile + workgroup_barrier wg_barrier(p_semaphore); + wg_barrier.inc(0); + } } } } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp index 19fa6c209f..f44c025177 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp @@ -237,6 +237,206 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpaddin PassThrough, PassThrough>>>& instances); #endif + +#if(defined(CK_ENABLE_FP8)) +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void 
add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void 
add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void 
add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); +#endif + template && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + op_ptrs); + + 
add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instances( + op_ptrs); + 
add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + op_ptrs); + } + } +#endif + return op_ptrs; } }; diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 6a1558a525..2c0b6c7b75 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -87,6 +87,12 @@ function(add_instance_library INSTANCE_NAME) list(REMOVE_ITEM ARGN "${source}") endif() endforeach() + foreach(source IN LISTS ARGN) + if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "gemm_xdl_universal_streamk" AND source MATCHES "_f8_") + message("removing gemm_universal_streamk_f8 instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() endif() #only continue if there are some source files left on the list if(ARGN) diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt index 2a930ab9ae..08746a52d7 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt @@ -21,6 +21,49 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp 
device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp) + + device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp + + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp + 
device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp + + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp + 
device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp) add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES}) diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp
new file mode 100644
index 0000000000..d03002af5c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8 = f8_t;
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault = GemmSpecialization::Default;
+static constexpr auto GemmKPadding = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+// Compute-friendly ("comp") Stream-K GEMM instances: A = F16, B = F8, C = F16, all row-major,
+// F32 accumulator. `GemmSpec` selects the GEMM padding specialization.
+// The list is compiled in only when `__gfx94__`, `CK_USE_GFX94` or
+// `CK_USE_FP8_ON_UNSUPPORTED_ARCH` is defined; otherwise the tuple is empty.
+// NOTE(review): `CK_USE_GFX94` is assumed to be the build-level gfx94 switch - confirm.
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances = std::tuple<
+// clang-format off
+    #if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm|
+        //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline|
+        //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version|
+        //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 4, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 32, 8, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+    #endif
+    // clang-format on
+    >;
+
+// Latency- and memory-friendly ("mem") Stream-K GEMM instances for the same data types and
+// layouts. `BlkGemmPipeSched` selects the block-GEMM pipeline scheduler, `GemmSpec` the padding
+// specialization. Guarded by the same macros as the "comp" list; empty when none is defined.
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances = std::tuple<
+// clang-format off
+    #if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm|
+        //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline|
+        //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version|
+        //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+
+        // Latency friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 256, 8, 4, 16, 16, 1, 1, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 256, 8, 4, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 256, 8, 4, 16, 16, 1, 1, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 256, 8, 4, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 8, 4, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 128, 8, 4, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 64, 8, 4, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 8, 4, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+    #endif
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..239d3a67f7
--- /dev/null
+++
b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Passes the "comp" instance list built with GemmDefault to add_device_operation_instances.
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
new file mode 100644
index 0000000000..9b65bbe9b3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed comp-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp new file mode 100644 index 0000000000..38cda9bf86 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed comp-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp new file mode 100644 index 0000000000..2afa4d5d6a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed comp-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp new file mode 100644 index 0000000000..0f7dad4c53 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed mem-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp new file mode 100644 index 0000000000..5968176941 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed mem-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp new file mode 100644 index 0000000000..c4423e4577 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed mem-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp new file mode 100644 index 0000000000..06f701f48c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed mem-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp new file mode 100644 index 0000000000..fda53c689e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed mem-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp new file mode 100755 index 0000000000..9272c74d73 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed mem-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp new file mode 100644 index 0000000000..7736f38cb2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; +// Compute-friendly configurations for row-major A, column-major B, row-major C (F16 x F8 -> F16, F32 accumulation). The tuple is empty unless the guard below holds; CK_USE_GFX94 is accepted alongside __gfx94__ to match the f8_f8_bf16 instance headers (NOTE(review): presumably so passes that do not define __gfx94__ still see the entries). +template +using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| 
DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Compute friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 16, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 
2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + #endif + // clang-format on + >; +// Latency- and memory-friendly configurations for the same layouts and data types; each entry takes its scheduler from BlkGemmPipeSched. The tuple is empty unless the guard below holds. +template +using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Version| + 
//#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 8, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 128, 8, 16, 32, 32, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 128, 8, 
16, 16, 16, 4, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 128, 8, 16, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 128, 8, 16, 16, 16, 2, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 16, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, 
GemmSpec, 128, 16, 32, 128, 8, 16, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 8, 16, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 128, 8, 16, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 128, 8, 16, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 128, 8, 16, 32, 32, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 128, 8, 16, 16, 16, 1, 4, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F8, F16, F32, F16, 
PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 128, 8, 16, 32, 32, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + #endif + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..4701d951a0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed comp-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp new file mode 100644 index 0000000000..cb57860da7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed comp-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp new file mode 100644 index 0000000000..67be95888a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed comp-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp new file mode 100755 index 0000000000..f9e46a5f2b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed comp-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp new file mode 100644 index 0000000000..419fcebddc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed mem-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp new file mode 100644 index 0000000000..7cbbc1813d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +/* Passes a default-constructed mem-instance tuple and `instances` to add_device_operation_instances. NOTE(review): template argument lists are not visible in this text; confirm them against the commit. */ +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp new file mode 100644 index 0000000000..e3ae258288 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100644 index 0000000000..0c6aa0a4e0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp new file mode 100644 index 0000000000..75871166a4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp new file mode 100644 index 0000000000..8c91bc877c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp new file mode 100644 index 0000000000..57b6ab3ae2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| 
DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 128, 128, 16, 8, 32, 32, 4, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 192, 256, 64, 16, 8, 32, 32, 3, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 
1, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + // We would prefer the following instance; however, an existing compiler bug causes it to fail to generate sane code. + // DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + #endif + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| 
Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 2, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + 
DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 128, 16, 2, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 128, 16, 2, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 128, 16, 4, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 128, 16, 2, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 128, 16, 4, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 128, 16, 2, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, 
BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 2, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 2, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + #endif + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..51a51d3c28 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp new file mode 100644 index 0000000000..7613f5076e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp new file mode 100644 index 0000000000..d015086f38 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp new file mode 100644 index 0000000000..4cb327f4f5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp new file mode 100644 index 0000000000..19b49c1f3f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp new file mode 100644 index 0000000000..9dd02b6e95 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp new file mode 100644 index 0000000000..e54568eaa1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp new file mode 100644 index 0000000000..cd1e176480 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp new file mode 100644 index 0000000000..7996c4441d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp new file mode 100755 index 0000000000..c2544be5f3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp new file mode 100644 index 0000000000..14bd36d29f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| 
DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Compute friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 64, 16, 8, 16, 16, 8, 7, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 64, 1, 4>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 
2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + #endif + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances = std::tuple< +// clang-format off + #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + 
//#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + // Memory friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 128, 16, 8, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 128, 16, 
8, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 128, 16, 8, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 128, 16, 8, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 128, 16, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 128, 16, 8, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, 
PassThrough, GemmSpec, 64, 16, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 16, 8, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 128, 16, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 128, 16, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F16, F16, F32, 
F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 128, 16, 8, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + #endif + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..eefc776151 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp new file mode 100644 index 0000000000..185874b249 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp new file mode 100644 index 0000000000..a92181ccc0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp new file mode 100755 index 0000000000..1551dba0fe --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp new file mode 100644 index 0000000000..0f3e51db18 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp new file mode 100644 index 0000000000..f87b8f6700 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp new file mode 100644 index 0000000000..0058a2ad6f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100644 index 0000000000..3a3bd5df9a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp new file mode 100644 index 0000000000..fb50e2589b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp new file mode 100644 index 0000000000..6413655b60 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt deleted file mode 100644 index 2a930ab9ae..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -# ONLY XDL_KERNELS -set(GEMM_UNIVERSAL_STREAMK_INSTANCES) - -list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp - 
device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp) - -add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES}) diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp deleted file mode 100644 index 6e8d5c798b..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = half_t; -using F32 = float; - -using Row = tensor_layout::gemm::RowMajor; -using Col = tensor_layout::gemm::ColumnMajor; - -template -using S = Sequence; - -using PassThrough = element_wise::PassThrough; - -static constexpr auto GemmDefault = GemmSpecialization::Default; -static constexpr auto GemmKPadding = GemmSpecialization::KPadding; -static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; -static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; - -static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; -static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; - -template -using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple< - // clang-format off - //#########################| 
ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| - //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| - //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| - //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 
128, 256, 32, 8, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 128, 32, 8, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> - // clang-format on - >; - -template -using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple< - // clang-format off - //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| - //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| 
ScalarPerVector| Pipeline| Pipeline| - //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| - //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - - // Latency friendly - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - // Memory friendly - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 
256, 32, 64, 8, 2, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 2, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 8, 4, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 4, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 64, 8, 4, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 8, 4, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, 
PassThrough, GemmSpec, 128, 32, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 4, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 8, 4, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 64, 8, 4, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, 
PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 64, 8, 4, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 8, 4, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> - // clang-format on - >; -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp deleted file mode 100644 index 6adcb8f4f4..0000000000 --- 
a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp deleted file mode 100644 index 631ae6872f..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp deleted file mode 100644 index 2c49773a65..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp deleted file mode 100644 index 39d54fb885..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp deleted file mode 100644 index 8ee50d63cb..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp deleted file mode 100644 index d31e0819a4..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp deleted file mode 100644 index fe19f35e53..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp deleted file mode 100644 index 6c1873b373..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp deleted file mode 100644 index ffd53f4069..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp deleted file mode 100644 index 094b8f92f8..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp deleted file mode 100644 index e00c1733e0..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp +++ /dev/null @@ -1,98 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = half_t; -using F32 = float; - -using Row = tensor_layout::gemm::RowMajor; -using Col = tensor_layout::gemm::ColumnMajor; - -template -using S = Sequence; - -using PassThrough = element_wise::PassThrough; - -static constexpr auto GemmDefault = GemmSpecialization::Default; -static constexpr auto GemmKPadding = GemmSpecialization::KPadding; -static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; -static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; - -static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; -static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; - -template -using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple< - // clang-format off - //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| - //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| 
SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| - //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| - //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - - // Compute friendly - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - // AGPR Spill - // DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - // AGPR Spill when use permuted lds layout. so, use padding for these two. 
- DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 64, 8, 8, 16, 16, 8, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 64, 1, 4>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> - // clang-format on - >; - -template -using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple< - // clang-format off - //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| - //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| - //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| - //#########################| | | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | | | | | | - - // Latency friendly - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 8, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - // Memory friendly - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 64, 8, 8, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 
0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 64, 8, 8, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 64, 8, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 64, 8, 8, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 8, 8, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 
S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 64, 8, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 64, 8, 8, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 
2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 64, 8, 8, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> - // clang-format on - >; -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp deleted file mode 100644 index 546f909b3c..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp deleted file mode 100644 index d91de96be3..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp deleted file mode 100644 index c70678b449..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp deleted file mode 100644 index 5410a0cc25..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp deleted file mode 100644 index 4ae7329f98..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp deleted file mode 100644 index 4fc5458a96..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp deleted file mode 100644 index 7369f87a57..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp deleted file mode 100644 index 45425a41a1..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp deleted file mode 100644 index 3b5ac0366f..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp deleted file mode 100644 index 53aa011a75..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/modified_files.txt b/modified_files.txt new file mode 100755 index 0000000000..34a42e3f37 --- /dev/null +++ b/modified_files.txt @@ -0,0 +1,10 @@ +example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp +example/01_gemm/run_gemm_example_streamk_v2.inc +include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp +include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp +library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp +library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp +library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp +library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp +profiler/src/profile_gemm_universal_streamk.cpp +modified_files.txt diff --git a/profiler/src/profile_gemm_universal_streamk.cpp b/profiler/src/profile_gemm_universal_streamk.cpp old mode 100644 new mode 100755 index cd3f5787d6..85f6c25770 --- 
a/profiler/src/profile_gemm_universal_streamk.cpp +++ b/profiler/src/profile_gemm_universal_streamk.cpp @@ -85,8 +85,10 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) using F32 = float; using F16 = ck::half_t; - // using BF16 = ck::bhalf_t; - // using F8 = ck::f8_t; + +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) + using F8 = ck::f8_t; +#endif using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -145,6 +147,24 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) { return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); } +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) + else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); + } +#endif else { std::cout << "this data_type & layout is not implemented" << std::endl; From 4c7035ff08f17aa138a747b8ea00ccf47276d85c Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 22 Nov 2024 08:30:01 -0800 Subject: [PATCH 06/52] fix path of ninjatracing (#1685) --- Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index b06726335a..76e6f0ebea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -87,17 +87,17 @@ RUN pip install --upgrade cmake==3.27.5 && \ git clone https://github.com/ccache/ccache.git && \ cd ccache && mkdir build && cd 
build && cmake .. && make install && \ #Install ninja build tracing tools + cd / && \ wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip && \ gunzip /usr/local/bin/ninja.gz && \ chmod a+x /usr/local/bin/ninja && \ git clone https://github.com/nico/ninjatracing.git && \ #Install latest cppcheck git clone https://github.com/danmar/cppcheck.git && \ - cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . -WORKDIR / - + cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . && \ + cd / && \ # Install an init system -RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \ + wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \ dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \ # Install packages for processing the performance results pip3 install --upgrade pip && \ From ff92222f937b54955011d394f46130fc5002110c Mon Sep 17 00:00:00 2001 From: schung-amd Date: Fri, 22 Nov 2024 17:51:35 -0500 Subject: [PATCH 07/52] [CK_TILE] MakeKargs overloads for backward compatibility (#1681) * Add overloads for MakeKargs Overload MakeKargs to accept std::tuple and std::tuple to preserve functionality of code currently passing in list initializers or tuples. * Add overloads for MakeKargs Overload MakeKargs to accept std::tuple and std::tuple to preserve functionality of code currently passing in list initializers or tuples. 
* Re-format files using ck_tile remod.py --------- Co-authored-by: Po Yen Chen --- .../ops/fmha/kernel/fmha_bwd_kernel.hpp | 444 ++++++++++++++++++ .../ops/fmha/kernel/fmha_fwd_kernel.hpp | 338 +++++++++++++ 2 files changed, 782 insertions(+) diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp index c5858a20f7..ccf15ee600 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp @@ -470,6 +470,248 @@ struct FmhaBwdDQDKDVKernel return kargs; } + // std::variant can't take in a list initializer, overload for backward compatibility + template + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t 
batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_do, + ck_tile::index_t batch_stride_lsed, + ck_tile::index_t batch_stride_dq_acc, + ck_tile::index_t batch_stride_dk, + ck_tile::index_t batch_stride_dv, + ck_tile::index_t batch_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + lse_ptr, + do_ptr, + d_ptr, + rand_val_ptr, + dk_ptr, + dv_ptr, + dbias_ptr, + dq_acc_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_do, + stride_dq_acc, + stride_dk, + stride_dv, + stride_dbias, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_do, + nhead_stride_lsed, + nhead_stride_dq_acc, + nhead_stride_dk, + nhead_stride_dv, + nhead_stride_dbias, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_do, + batch_stride_lsed, + batch_stride_dq_acc, + batch_stride_dk, + batch_stride_dv, + batch_stride_dbias, + split_stride_dq_acc, + window_size_left, + window_size_right, + mask_type, + p_drop, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + + // std::variant can't take in a list initializer, overload for backward compatibility + template + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t 
hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_do, + ck_tile::index_t batch_stride_lsed, + ck_tile::index_t batch_stride_dq_acc, + ck_tile::index_t batch_stride_dk, + ck_tile::index_t batch_stride_dv, + ck_tile::index_t batch_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + lse_ptr, + do_ptr, + d_ptr, + rand_val_ptr, + dk_ptr, + dv_ptr, + dbias_ptr, + dq_acc_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_do, + stride_dq_acc, + stride_dk, + stride_dv, + stride_dbias, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_do, + nhead_stride_lsed, + nhead_stride_dq_acc, + 
nhead_stride_dk, + nhead_stride_dv, + nhead_stride_dbias, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_do, + batch_stride_lsed, + batch_stride_dq_acc, + batch_stride_dk, + batch_stride_dv, + batch_stride_dbias, + split_stride_dq_acc, + window_size_left, + window_size_right, + mask_type, + p_drop, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + template CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -616,6 +858,208 @@ struct FmhaBwdDQDKDVKernel return kargs; } + // std::variant can't take in a list initializer, overload for backward compatibility + template + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + 
ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + lse_ptr, + do_ptr, + d_ptr, + rand_val_ptr, + dk_ptr, + dv_ptr, + dbias_ptr, + dq_acc_ptr, + seqstart_q_ptr, + seqstart_k_ptr, + seqlen_k_ptr, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_do, + stride_dq_acc, + stride_dk, + stride_dv, + stride_dbias, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_do, + nhead_stride_lsed, + nhead_stride_dq_acc, + nhead_stride_dk, + nhead_stride_dv, + nhead_stride_dbias, + split_stride_dq_acc, + window_size_left, + window_size_right, + mask_type, + p_drop, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + + // std::variant can't take in a list initializer, overload for backward compatibility + template + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t 
nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + lse_ptr, + do_ptr, + d_ptr, + rand_val_ptr, + dk_ptr, + dv_ptr, + dbias_ptr, + dq_acc_ptr, + seqstart_q_ptr, + seqstart_k_ptr, + seqlen_k_ptr, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_do, + stride_dq_acc, + stride_dk, + stride_dv, + stride_dbias, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_do, + nhead_stride_lsed, + nhead_stride_dq_acc, + nhead_stride_dk, + nhead_stride_dv, + nhead_stride_dbias, + split_stride_dq_acc, + window_size_left, + window_size_right, + mask_type, + p_drop, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_, ck_tile::index_t nhead_, ck_tile::index_t seqlen_k_) { diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index e0c145fde7..4443a45038 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -399,6 +399,186 @@ struct FmhaFwdKernel return kargs; } + // std::variant can't take in a list initializer, overload for backward compatibility + template + __host__ static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* 
v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_lse, + ck_tile::index_t batch_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + const std::tuple& drop_seed_offset) + { + return MakeKargs(q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_lse, + batch_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + + // std::variant can't take in a list initializer, overload for backward compatibility + 
template + __host__ static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_lse, + ck_tile::index_t batch_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + const std::tuple& drop_seed_offset) + { + return MakeKargs(q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_lse, + batch_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset),
std::get<1>(drop_seed_offset))); + } + template __host__ static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -522,6 +702,164 @@ struct FmhaFwdKernel return kargs; } + // std::variant can't take in a list initializer, overload for backward compatibility + template + __host__ static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqstart_q_ptr, + seqstart_k_ptr, + seqlen_k_ptr, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + + // std::variant can't take in 
a list initializer, overload for backward compatibility + template + __host__ static constexpr std::enable_if_t + MakeKargs(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + const std::tuple& drop_seed_offset) + { + return MakeKargs( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqstart_q_ptr, + seqstart_k_ptr, + seqlen_k_ptr, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + } + __host__ static constexpr auto GridSize(ck_tile::index_t batch_size_, ck_tile::index_t nhead_, ck_tile::index_t seqlen_q_, From a420b3b34d2ad3e897aec824288182cf1e442dd6 Mon Sep 17 00:00:00 2001 From: Illia Silin 
<98187287+illsilin@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:30:12 -0800 Subject: [PATCH 08/52] add Andriy to the code owners (#1687) --- .github/CODEOWNERS | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5340be274b..d7a6b17783 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,8 @@ -* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca # Documentation files -docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk -*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk -*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk -.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca +*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca +*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca +.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca # Header directory for Doxygen documentation -library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk +library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca From 19d4b790399e479abd66d6555265fd7cd6389931 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 22 
Nov 2024 17:16:08 -0800 Subject: [PATCH 09/52] add --squash flag when building dockers (#1686) --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index b79b2045b0..2f790d8e5b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -132,7 +132,7 @@ def buildDocker(install_prefix){ checkout scm def image_name = getDockerImageName() echo "Building Docker for ${image_name}" - def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' " + def dockerArgs = "--squash --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' " if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){ dockerArgs = dockerArgs + " --no-cache " } From ce2bdf42a9c7d78e60d16cfb00581c83a0bfc49c Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Mon, 25 Nov 2024 12:31:38 +0800 Subject: [PATCH 10/52] Change in fwd-splitkv kernel to support num_splits=1 case (#1690) * Change in fwd-splitkv kernel to support num_splits=1 case * Update in codegen fwd-splitkv to make num_splits > 1 cases pass * Specify instance traits in dispatch * Fix link error for fp8 kernels --------- Co-authored-by: Po Yen Chen --- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 43 +++++++++++-------- .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 19 +++++--- ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp | 3 +- 
.../ops/fmha/pipeline/tile_fmha_traits.hpp | 2 +- 4 files changed, 41 insertions(+), 26 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index d1da951567..1c40cf6f31 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -247,12 +247,22 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const }} """ -FMHA_FWD_SPLITKV_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.do_fp8_static_quant == {F_squant}) && +FMHA_FWD_SPLITKV_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) && ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{ - using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; - using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>; + using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; + if (t.has_lse) {{ + if constexpr (std::is_same_v<{F_dtype}, ck_tile::fp8_t>) {{ + return -1; + }} else {{ + using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, true, {F_squant}, {F_spad}, {F_dvpad}>; - return 
fmha_fwd_splitkv_(s, a); + return fmha_fwd_splitkv_(s, a); + }} + }} else {{ + using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, false, {F_squant}, {F_spad}, {F_dvpad}>; + + return fmha_fwd_splitkv_(s, a); + }} }} """ @@ -614,27 +624,26 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> squant = 't' if dtype == 'fp8' else 'f' pipelines = [] if dtype in ['fp16', 'bf16']: - for mask, bias, lse, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]): + for mask, bias, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]): # TODO: use async pipeline when compiler is more stable if hdim == 256 or hdim in [32, 64, 128]: ### [32, 64, 96, 128]: # if True: - pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) else: - pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, 
mask)) + pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) if receipt == 1: - pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) # TODO: cover arbitraty hdim - pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask)) # TODO: cover arbitraty hdim + pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim + pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim elif dtype in ['fp8', 'bf8']: - # no need lse/paged-kv kernels for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): - pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', squant, 'f', mask)) + pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 't', squant, 'f', mask)) else: assert False return pipelines diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index 98a4329d75..3c4e02d08b 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -35,6 +35,7 @@ struct FmhaFwdSplitKVKernel using LSEDataType = ck_tile::remove_cvref_t; using SaccDataType = ck_tile::remove_cvref_t; using OaccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; using VLayout = ck_tile::remove_cvref_t; @@ -234,8 +235,10 @@ struct FmhaFwdSplitKVKernel const void* k_ptr, const void* v_ptr, const void* bias_ptr, - void* lse_acc_ptr, - void* 
o_acc_ptr, + void* lse_acc_ptr, /* workspace for lse accumulation when num_splits > 1, otherwise + final lse */ + void* o_acc_ptr, /* workspace for o accumulation when num_splits > 1, otherwise final + o */ ck_tile::index_t batch, ck_tile::index_t seqlen_q, ck_tile::index_t seqlen_k, // only used if 'seqlen_k_ptr' is not specified @@ -356,8 +359,10 @@ struct FmhaFwdSplitKVKernel const void* k_ptr, const void* v_ptr, const void* bias_ptr, - void* lse_acc_ptr, - void* o_acc_ptr, + void* lse_acc_ptr, /* workspace for lse accumulation when num_splits > 1, otherwise + final lse */ + void* o_acc_ptr, /* workspace for o accumulation when num_splits > 1, otherwise final + o */ ck_tile::index_t batch, const void* seqstart_q_ptr, const void* seqstart_k_ptr, @@ -591,9 +596,9 @@ struct FmhaFwdSplitKVKernel static_cast(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_v + batch_offset_v; - OaccDataType* o_acc_ptr = reinterpret_cast(kargs.o_acc_ptr) + - static_cast(i_nhead) * kargs.nhead_stride_o_acc + - batch_offset_o_acc + i_split * kargs.split_stride_o_acc; + ODataType* o_acc_ptr = reinterpret_cast(kargs.o_acc_ptr) + + static_cast(i_nhead) * kargs.nhead_stride_o_acc + + batch_offset_o_acc + i_split * kargs.split_stride_o_acc; // Q/K/V DRAM and DRAM window const auto q_dram = [&]() { diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp index 71c3bd1715..4e8d8694d7 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp @@ -25,6 +25,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS using LSEDataType = remove_cvref_t; using PDataType = remove_cvref_t; using OaccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; using FmhaMask = remove_cvref_t; using BlockFmhaShape = remove_cvref_t; @@ -48,7 +49,7 @@ struct 
BlockFmhaFwdSplitKVPipelineQRKSVS static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; static constexpr auto BiasEnum = Problem::BiasEnum; - static constexpr bool kStoreLSE = true; // always store LSE (acc) + static constexpr bool kStoreLSE = Problem::kStoreLSE; static constexpr bool kIsPagedKV = Problem::kIsPagedKV; static constexpr bool kHasUnevenSplits = Problem::kHasUnevenSplits; diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp index e3187042d2..d7bf8ea7e7 100644 --- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp +++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp @@ -39,7 +39,7 @@ template 1 or fwd training is running */ bool kDoFp8StaticQuant_, bool kIsPagedKV_, bool kHasUnevenSplits_, From 36c7ce4e0eef86df186f8d796d7e177b8b13df92 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Mon, 25 Nov 2024 13:12:35 +0800 Subject: [PATCH 11/52] [CK_TILE]Moe update index (#1672) * update MOCK_ID for moe-sorting * add moe-smoothquant * update a comment * fix format * hot fix * update topk in overflow case * update comments * update bf16 cvt --------- Co-authored-by: valarLip <340077269@qq.com> --- .../ck_tile/14_moe_smoothquant/CMakeLists.txt | 25 ++ example/ck_tile/14_moe_smoothquant/README.md | 15 + .../moe_smoothquant_bf16_n1024_instance.cpp | 22 ++ .../moe_smoothquant_bf16_n1536_instance.cpp | 13 + .../moe_smoothquant_bf16_n2048_instance.cpp | 14 + .../moe_smoothquant_bf16_n256_instance.cpp | 12 + .../moe_smoothquant_bf16_n3072_instance.cpp | 14 + .../moe_smoothquant_bf16_n4096_instance.cpp | 14 + ...moe_smoothquant_bf16_n4096_tp_instance.cpp | 14 + .../moe_smoothquant_bf16_n512_instance.cpp | 13 + ...moe_smoothquant_bf16_n64_n128_instance.cpp | 12 + .../moe_smoothquant_bf16_n768_instance.cpp | 12 + .../moe_smoothquant_fp16_n1024_instance.cpp | 22 ++ .../moe_smoothquant_fp16_n1536_instance.cpp | 13 + 
.../moe_smoothquant_fp16_n2048_instance.cpp | 14 + .../moe_smoothquant_fp16_n256_instance.cpp | 12 + .../moe_smoothquant_fp16_n3072_instance.cpp | 14 + .../moe_smoothquant_fp16_n4096_instance.cpp | 14 + ...moe_smoothquant_fp16_n4096_tp_instance.cpp | 14 + .../moe_smoothquant_fp16_n512_instance.cpp | 13 + ...moe_smoothquant_fp16_n64_n128_instance.cpp | 12 + .../moe_smoothquant_fp16_n768_instance.cpp | 12 + .../instances/moe_smoothquant_fwd_api.cpp | 145 ++++++++++ .../moe_smoothquant_instance_common.hpp | 62 ++++ .../14_moe_smoothquant/misc/moe-sm.png | Bin 0 -> 206879 bytes .../14_moe_smoothquant/moe_smoothquant.cpp | 264 ++++++++++++++++++ .../14_moe_smoothquant/moe_smoothquant.hpp | 114 ++++++++ .../14_moe_smoothquant/script/perf_test.sh | 37 +++ .../14_moe_smoothquant/script/smoke_test.sh | 30 ++ example/ck_tile/CMakeLists.txt | 1 + include/ck_tile/core/config.hpp | 5 + include/ck_tile/core/numeric/bfloat16.hpp | 36 +++ .../host/reference/reference_moe_sorting.hpp | 29 +- .../fused_moe/kernel/moe_sorting_kernel.hpp | 83 +++++- include/ck_tile/ops/smoothquant.hpp | 1 + .../kernel/moe_smoothquant_kernel.hpp | 205 ++++++++++++++ 36 files changed, 1321 insertions(+), 11 deletions(-) create mode 100644 example/ck_tile/14_moe_smoothquant/CMakeLists.txt create mode 100644 example/ck_tile/14_moe_smoothquant/README.md create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp create mode 100644 
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp create mode 100644 example/ck_tile/14_moe_smoothquant/misc/moe-sm.png create mode 100644 example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp create mode 100644 example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp create mode 100755 example/ck_tile/14_moe_smoothquant/script/perf_test.sh create mode 100755 
example/ck_tile/14_moe_smoothquant/script/smoke_test.sh create mode 100644 include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp diff --git a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt new file mode 100644 index 0000000000..12224a39a2 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt @@ -0,0 +1,25 @@ +function (add_moe_smoothquant_example TARGET_NAME MAIN_SRC) + message("adding ${TARGET_NAME}") + # not using add_example_executable() to add target, since we don't want this to have + # to be included in "make all/install/check" + add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC}) + target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) + + foreach(source IN LISTS ARGN) + list(APPEND INSTANCE_SRCS ${source}) + endforeach() + + target_sources(${TARGET_NAME} PRIVATE ${INSTANCE_SRCS}) + + set(COMPILE_OPTIONS) + # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations + list(APPEND COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + # list(APPEND COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) + + target_compile_options(${TARGET_NAME} PRIVATE ${COMPILE_OPTIONS}) +endfunction(add_moe_smoothquant_example TARGET_NAME MAIN_SRC) + +file(GLOB INSTANCE_SRCS instances/*.cpp) + +add_moe_smoothquant_example(tile_example_moe_smoothquant moe_smoothquant.cpp ${INSTANCE_SRCS}) + diff --git a/example/ck_tile/14_moe_smoothquant/README.md b/example/ck_tile/14_moe_smoothquant/README.md new file mode 100644 index 0000000000..599b4c3489 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/README.md @@ -0,0 +1,15 @@ +# moe-smoothquant + +This folder contains example for moe-smoothquant using ck_tile tile-programming implementation. 
+![](misc/moe-sm.png) + +Unlike the standard smoothquant op, the input scale here comes from different experts (`[expert, hidden]`), so we need to reuse the `topk-id` from the previous `topk-softmax`, select the corresponding `expert` from the current topk, and expand the output/per-token-scale by `topk`. + +## build +``` +# in the root of ck_tile +mkdir build && cd build +sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +make tile_example_moe_smoothquant -j +``` +This will result in an executable `build/bin/tile_example_moe_smoothquant` diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp new file mode 100644 index 0000000000..f43626147f --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +#if 0 +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +#endif + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp new file mode 100644 index 0000000000..e380520fce --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp new file mode 100644 index 0000000000..4d536cd61d --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp new file mode 100644 index 0000000000..b38a4733a4 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp new file mode 100644 index 0000000000..c5c170aef1 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp new file mode 100644 index 0000000000..0e48a1b691 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp new file mode 100644 index 0000000000..4af42c6c80 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp new file mode 100644 index 0000000000..ea611a1834 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp new file mode 100644 index 0000000000..a6209820e6 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp new file mode 100644 index 0000000000..f569dedf35 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp new file mode 100644 index 0000000000..3793adb5c5 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp @@ -0,0 +1,22 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +#if 0 +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +#endif + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp new file mode 100644 index 0000000000..4bf9cb1a49 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp new file mode 100644 index 0000000000..eb0d0fe103 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp new file mode 100644 index 0000000000..36bc0de150 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp new file mode 100644 index 0000000000..fa6f53b2d4 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp new file mode 100644 index 0000000000..9b7462ab92 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp new file mode 100644 index 0000000000..8911bc2295 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp @@ -0,0 +1,14 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp new file mode 100644 index 0000000000..07783ac168 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp @@ -0,0 +1,13 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp new file mode 100644 index 0000000000..a5ab56a76c --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp new file mode 100644 index 0000000000..4272cbafc6 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp @@ -0,0 +1,12 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp new file mode 100644 index 0000000000..a65d3fde66 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include "moe_smoothquant.hpp" + +template +using trait_ = moe_smoothquant_traits_; + +template +float moe_smoothquant_dispatch(moe_smoothquant_traits /*t*/, + moe_smoothquant_args a, + const ck_tile::stream_config& s) +{ + float r = -1; + // clang-format off + // rm rn tm tn vn pd 2p + if(a.hidden_size <= 64) { + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 128) { + if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 256) { + if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 512) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 768) { + if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 1024) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 1536) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 2048) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else 
if(a.hidden_size <= 3072) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 4096) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size > 4096) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + return r; + // clang-format on +} + +float moe_smoothquant(moe_smoothquant_traits t, + moe_smoothquant_args a, + const ck_tile::stream_config& s) +{ + if(t.data_type.compare("fp16") == 0) + { + return moe_smoothquant_dispatch(t, a, s); + } + else if(t.data_type.compare("bf16") == 0) + { + return moe_smoothquant_dispatch(t, a, s); + } + else + throw std::runtime_error("Without supported instances!"); +} diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp new file mode 100644 index 0000000000..88d3000910 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include "moe_smoothquant.hpp" +#include + +#pragma once + +using S = ck_tile::stream_config; +using A = moe_smoothquant_args; + +template +using trait_ = moe_smoothquant_traits_; + +template +float moe_smoothquant_(const S& s, A a) +{ + using DataType = typename Traits_::DataType; + + using PipelineProblem = ck_tile::SmoothquantPipelineProblem< + typename MoeSmoothquantTypeConfig::XDataType, + typename MoeSmoothquantTypeConfig::XScaleDataType, + typename MoeSmoothquantTypeConfig::ComputeDataType, + typename MoeSmoothquantTypeConfig::YScaleDataType, + typename MoeSmoothquantTypeConfig::QYDataType, + typename Traits_::Shape, + Traits_::kPadN, + Traits_::kTwoPass>; + + using OnePassPipeline = ck_tile::SmoothquantPipelineOnePass; + using TwoPassPipeline = ck_tile::SmoothquantPipelineTwoPass; + using Pipeline = std::conditional_t; + + using Kernel = ck_tile::MoeSmoothquant; + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + + auto kargs = Kernel::MakeKargs(a); + if(s.log_level_ > 0) + std::cout << ", " << Kernel::GetName() << std::flush; + + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); +} diff --git a/example/ck_tile/14_moe_smoothquant/misc/moe-sm.png b/example/ck_tile/14_moe_smoothquant/misc/moe-sm.png new file mode 100644 index 0000000000000000000000000000000000000000..5a40099ef3ce3860ed133e4b150ad4785108f129 GIT binary patch literal 206879 zcmdSBWmjC$)-{+ABzW);EVx5(*WeDpHMmo_YjAfhoZv2n6&48Y?uEO%Ls#xIp8Iz9 z=s(c)ZJlvy>?3>0oNKNVuB0G|iu48P-Me?F(o$k7@7}@Dyn6=&`3M6&LX~5o_U_%= zo3xmSs{7J$7TnJnsb&8}o4F>4O0JBK46R*ZC?9rc&*~1hE;@JS2Q;^)%~KV^h#$GZ zD%vH&t1dp|YbL2%;J{TzLhVzZ*Qa~%;4yHJB7J)?ew|_12)e}o{=D(+KLP&Nc=zt! 
z(ck^Uf6XpD*uMn-wcd4n{R;U%);Zk!3;6$<{>&f4xc+Os^A-_qd;ec^9T{d5^}p6T zgHQjDC4c?@c*%$;cY4}g)9et%_1(n0tPSHdFr`7)7QTF3Wgg}Zwsab@D2t7jHAarP zx6UFRot89)8F?D=f6mkRNAQY}HU?$hVk@tWF)FO0^g7WNoHL5V>C zGK?oOjRJaxUr~z9z=01vtQT;+JiVB~;`WIN&EZaXC&5!Tu&QbwJ(^RKC;n=4D(4MP zvj&TOy-;Bl92Xhb6+IkHHoqT*>sM!%M9qd>H3*Tc<`gXntpO*9;JX9)@sF zf|LF2&t9A|B@M zPAls?5}o6*U-ZKQ)_KZP2U%YA-X=jb^@k=0<=1ZwctvQQDgm7RHpJJ;-wX)en|U(} z=4Q|xOH;}A>aFh|a`{>RbBYdn=q@slqp@&3r!?DqIDV_CNKSGqZr9MliruBpHi>G_ z3`=)c@eb6+6DVmbM*O3BOJV$cLlFf&%4oK&W1#&y2G`8xQ2VX*RY^`i6^lhmWMns> z71A!(o}f}deg79S((S`ZL78gDXT?N zn@FH<+@U4>qg4b30Bw)hYg8;ZO`)SPG^6#4UOL7614*R9$gELN%SXN8#bfJZtHba) z2T3u{fH7UzjM5W2@FHY*e@GWp_0TeZhqaxCZ}2<)(GIs*r$c$G%Oz4FSk8LXwMQ2FCy{TQ2CH$iQZ;JWWSo;VYUVv#ZnW#X?uhN_E zRn#`Vnc)}m2J0wCKcyiD>pPaB=(|C`M^O|wb;N7-gQN*`tI`(kOZkpOmX3jD}dW0M=H7T6De9@=WN)u|%R>z+DiVY2{Yje<`71H8cIBu2gYGrhl z5MIvOrIItLT2Kb>@*;iL?9~2HVzuLPR%}1boG#9vx+8*C6nb-WXBW+svdi1OwB|S~ zD<}V_+`2{y4Z%T`2cqc75*jDG)Rb)HrtRY|z0p-8N(T|PD0@}NZ25)8@ahf=Q(cSu zLkr&W(FK!>4`$5vL~JDyFT3zeMRQkN0)=s&z)F*h)JDOzvZTpmOBvhg>FVqv8kX#q zkMALK)Ie}N72W&!)p$fLD<=$#B5%aSX{N&HD%Jgcf41sh20K7-m*pHd?OsIE+2xDS zz7SDl$5U&UH>I)ppYdaZv3u>56UT#_ymP`R024Z;4J5NbTc#C- z;i6Q=?$JgH`w*>nn8N<5>T{^(yX;h5F(Np*TUr7Z@|tx{`mE>0&jje26(b5E4v5+d zFw|-=vi`8AK&&o-_0_h@fqTi@W4E-_m~mw(m6doyx~S}7Q3H;*Pc@N)7c_$nu28?g z*g1)&5*|7v{A@_O33PyXGYWvpdib%%|FU>w;F{}Gmea7=8qst&l%YAvi%m?)g?C9Y z_GDO2CLE=8m{v5+alNW$l}&)S3GT=gsn^pZzvV{FQZfCyU^l&!Y)|=NMpi;oR9b)S z?0->w=#4tNLz-BMx;LzlQn7lLU~0g>=uB!+C9V6@G%@$ z4Wo*cwZxQG0}JndswN82O=VOSV}EmM%El2uqJx!szBv&k?e>E)m zPNP0Qt>q`yroF;@(Xx+}Hx;2x)!}=WT_wi`zY-B)sO5C^*oPw!y)}&YcA5G>4^}E7 z1y~Njsu*;KGGGPmcLWdb&mBv>}G`vy6+yQ4Y2-1Inq(Pu(&qQXHV- zaO>D9fUltN-o|o~sVXD?u(Dfc5;Tu`6j|dJ#*~xwsX7bq_3@P}?W9!AsPPi^>=~Mk z=!mcc`q<#kFxat+>UI?M7Y4ynif`_cpH~iP7^cwi8A(^2)EaWysMBe{Q$& zXPeE{r1nV<%qd-1W?8{9x~$(YhwWc!(8&RmXIumU;AVGsi{dG=DR=2HzqAepepxeK znBF?dzZ64(P032;4Cu@GbqEHs2^WYsqnvIj*{uKdo8RtxmX=YR@wO?T__(HJuZ+W} z)=TNWH2UC1_aBoVhSRMs^j|5ASE5ZuY$R0z{J9C%=N1<4ul6_6+Qz7jSqw=%nkOse 
z^spEq2>DJER|tI>Cr2UmU|{nt`=q`PwVHDIIWEA{fZ)zfzmJ?>PGgQ2qkKoZO@vW@-F3E^fK4DZ-%CE#W~Wga;tVx=86$AqS{&XSR%zqezuv6E1~2x}H0A-1 zP5cLO&*(x6zW*4J_^FAmqbwK8+83E_<_nEEho+>AkZjlPa(+iK~Evu|-i!A5Dl$!*&5(w`%A@3;MJf_SEL+f!@;BL`F#c_bD@+xlLPI zs__Qr@*gYr0Hi`^YY;9miftw6c%4a(vNatpMp`2Cg0NXIUF=ZQ$cZ|K=i!Wt`{^5 zXT~xvB@XGI!D6{u65sI+A{AgsV}NJh0Aaa>(Ly-60J3NbB!c#zGzbN70b3PuUyp`!jV$M zW6UC%#V4(3D$ahUI-~L zJ4!`fJUbVT#laK4 zObCmReLO+Pp;@>647(GLG!ui(yW&uswdJ8^JsKAjI_aw(G-X&tdS6TWt?#d&Q`!?K z%#jZ)W0I2+aOAHKW+$AbT4fAfo!KMS87PN=VgAkWWE#L|f8v3Yzio$TeO|)3)}-t| z3ITI$cz^V}%WddcIXWKgGRW4BCnTN!NI!_3Z1(w~7-_*{*Yt3Q_HD%tlJf>8(3>WQg|R)PJr6_9<9s$cYWi zi|uAa=8}X&Xw>2Xtuz!>EP7aHgVix^WTi}D5;3}+8u(Kz5k^k3e>qjEx&QURpHby_+-TQ8snQm?H)4olA z_dyhH3wVS1584#RfP%$)Pl}VU!Il(-G(UOlci#Oq*yN6>9og2IhcY9)+9Iu=C9LeDn4-dM%pX2!lyOD~+!kY@i zMl_(0LOgkU)xqNre$Yqlwt#%30%Nf%=;fR<#>f4Q=r2Wtv&!hT;+A?X(R$X|`w0Cn zO(q#RBsCE!gU~&k)L2>B%`3H1OYXAL3HBWcY8aQ&%!A=y0!j>IDN#R0R)(TUR_xKX zMEHIdQt1Tmkxw!C-#{sJUz$epKLl(Uk;Z=grdmUX^Syqgxc_-$lEsD*eCC`D2VJHM zTU?@zgFUea$Fibc_1ccq*C@~g`Qd%9R{)$jyp`AP+WAwKgpxt?)(0`U@we6p%+w2Y2Od}_c$ zy0>3@#*-5JExaaCK5Hm*19ZeaAB+oIkhXR^Y8Krk(%~K(5|v;Rk}9}c)|)%{7M$@o zWAFFjKU-HB2{MwpSA=ar9b?GFS_`Yio2JigR&fE;4uuZ_y59IaVx2U~Mfm}2%vvUELmKXU|Yee^7c+1fkZGC z<6JtBi*|M?2KAZx)~cY8##uwxW&OBx{nM||tLv-q36!m$rY$DUfITDE5S|5hei&ni z3=z8CebYEf`(Kiuq~O*pYs9T2Fli`d^O6`+vG+R0zW3d}I@=Yo%GqPYfoek*xIu>jD~o+hm$f#!vC@u=wy03#$v?s zWlQFBFsa9WiQ`UeqcIIx()~%EBnc*49aSYHD;`*_VvQHGuUsv|Q>orhR6p*J{XfzE zU#wm~dcTQ^&Ev~|AyGx6QTjBl2mFyM%+*!oi$lyGJ32;h&t%u>arwzwc5 z4mqy*)x~%g$aqUbx4RigJ#K{QgpEWT$+x7%QK8;pT1l6}DxJ4lhB`!a$$vXi8H!k% zj_&~{g?Ca>N$1(xP^!`1;N;N$=nz==y+B`HtCQfx)|wW`RpBKib{*_xIGE0ZYb0G6 z0r{}R-T+EybI)>pG#hx?V@Yp;qtVKW5N!W&7#`&Poyy+lgPo`PY3&iZD48*v)0GRm zEr9aM!O8B_ptLoX+Htcf76s~K1tma3<xQJ1qL1ysf=dyPDz&_k&3HYst@_6i-DoG6*f*B^V$nQ7&l7zc8D7lY z2z@(&jQNWj`@Cy`hMP|HPqpEep;fD>k)FxLN{DbK9A_%+I$U94^143zgyPDkt6Az% z2eICK)QYdmtpe54^>Na7fYlPWVO~1a!aEbs0TO~K$1AB!u4cWh9D-R@YuX0O=$O`v z{l(~-l7+uBy1|&VEd|jP+FIjtpmH#FJ;yQIQYt|nMkGwEHSJPM+G^$>J*7uvx9`*S 
zPH>_kl}LqNT-9e5fau!`>Fc9(AbN?CQj(L;9a5fuET>+3)ZU0lmODSBSja8lKgGiT zYx~4Ja?_p87cK`;&|)H`L>+@(_k$-=t0yl@UwB(_>j|2y%HeYo>3fH4LbJ zf913L0X~chIJv~embvAn^no-MY&al3Voo;E8grvl$E9*!mES*gjiq&zCQo=BhZy6b z>$Vk49@tlzv-cfRVii{GZ*T(K?~FLrv27jqndjYQ*MYGH5CZI|>kR9q*+^b#d(`<} z8wToI3!UN!^h-?`>RW^6md!4$aH;+6pVp)pWMJF`@88}u=u$8rtkkNM8?Re?_I}hk zT@SZ>h`0v%mBV9sf|8Dk(?1~(Ub$s6toiZs8N4>4b3}}sAdASIPUjiFl!ZulPcW5$ zT1}tEGTW~PCN8ws4r9r=B!*wS+s#l9jm#-=5Z%ks^@L4nwy%_1@I$7pX^9@)l)!z7 zo*Y6>SBrl^;6k$a%HWaqpq`UrWvTf|5z-cJZNOeaq~vW~eWMs(7#i+U0Y1Uq-XIQt z!ZuR0VLYZ656_aej$mZCB#X{`DW{hdLUKrSm~3XVXR2tSvL%Smk7Q%(G^qPh8WK4_ z_#MyI>@$4wRKqLLks4EZ0e-YPnEQNt5gZ$b>GXp4*)HdSKU+bv1-I$V$pp}}iDu!% z%F^<<-n%Nor4+EBqvH-vX?7PPO#YCnl}V#isxwn@-G6?%zuaH73T%I41 zG-kFfc9*%(08_#5F2R7Q&U6x^dZJ}R*2)}nfT`3oZ#`3Y@_bR4o`1ihXvB7?9ajCg0NSqcFf@%vuTA(KX*~PYYNTT_X7czjnRI<$>Rd#+aAbGCl?lQTZ*6 zk4VH-Fz=bkm`a69C7yPE8n;+p{*+C}ZaGa3-H^S@9!f2rSnD7j!{J}i%=Gz$fdxzp zX;>RmX*{3TFVu@5e=64Ue2v*kJrff7MVAb%DT1UX>uSFfmGg5h?Fl(T>gu2+0q z#1PhwNZuR+;bgrCjzNZDB5PfL^+J3?RzLMeS!6_Bc3iT_lk<}85yJPeCJu7tv(blN zCfcB*`V*{b6D4(o^0GZ|nC&V|T!W}yK;T*bDN4hcZ z7@p-0E+YEgfDD)E1*915sGv>MH?a^j>je;qHqt7}PhCE|_z-tfXtqlRP#hkl{AP4J zF(z=BLjRM6G$y#u>@K4Psno{73kcdmSkTt`c;CFy{sSQ5X13kS9YXF0tl|A+Ny^&P zTi^+I|AZuUEbTlu)lN-Iu>kX$BuI2z$%t)ravluCoT|4s@yDI2UDf8eqVL}`maGmQ zU2{zv-?Mg6Eloh{bhXnet}BjkkyembDH05DSI_jcZ1((dTyDv|d1-O^#~v`J{i zQWiW)iNPkc?*BdB@Ut6Cq*o?Iu~@bL2@aw0GPet6T5uqwZWfux@@lkQ%VnL?ONeD$ zbTOQ7If4;fgA5rxEchy*al#13GixYngP}AIzvCrOAfKCsCJv>#wH7j>pwZwdpQ1q= zf3~LZt?lai9z4fMa4!3WuL&Ebc3g@&?0#!(-Quj!~Wx749g((rV`~ zf*~DM@7o-(LXbr7#lvrv%EjyD8l-F{-i#&B!=;!X8u0{A51ylhD_RVna;F*-5}cZH zcyS1Sy-M&kdsua5dTZxRT?4^XOteHe4{ZjXx3>Bq;Xp0q+BMEBLDwxoPhl%f&`J(D zcu-9)K$pk0H%6lrfB(#SO&ym|(B$7u%et9G zGk+307=QkeviJJ7(KYE~hfB=6^@4}57iP6u2+Z&T5twM zeLRJRujNRF(5WCL=kBn8g?#z=>coki*;dzNt-$4CvvS;HZddYap4NSK-l3+X>0cWH zaigc)PPSWL)FZ z9K4*2Nxv~qT=V$rm{ZJ&Dqpq24BT^%4d$)>ij{x{CCO=EF-qnB&Tl1UBT^66`;E&N 
zb7$;mSErusyqHGZrW`Rv{d(k5&Z(`+>5vrqornmS{`V?JRUHTLgEG5*So*0oh|$OrbJ_u{eW))!`XYbeNGS!6hfr{Yw)v0+&=c>y6) z(o()_607c-`9<_|-A zuj@)J+I+uDbMP+|21Zb06;VqLukg8E0>im@_XpPcsut3dsU_!^O2*!20q@DZTJFDk zA3nCD^epLUF+GDMXIbrd9?Bm0Vj)P8qP1mDo+Bhn-x)Cf7Mr3#fW)1Jei<` zaE+LwGZO%xtLe{$VQz}$5V@%pbla)fEqM24ZZa?$TBLT}or$9a8rL@GuWp0_REEzhZvOBs!UkBC#OF$V;NBA2_70UUk|V zM82+-U$t@yb#{-BW zBiDSt@K`On%9bubWE!32{LB#uw$Dpo^RZO=tC!NNl@cjftcCYH)rQIV#D}*M*dE9A zWcNcA$$Kl6=LC+$yZ_FDldVu=DRxm0)XVdm{uq0}Vz}Qf6>X(TCuo%ax{G?BMPX#< z+R=75GD}yfUR;0|1qHT_TG27Agl^arjC|;b4^aVG+Rv)elBDD2J;s=Jg5jRYN!dy1 zg-Yeq^~e)DR2Y|t#EJ!zFH42u)wm#hcreZv9@7{71_Mz+zwzUE&J@-LJZK=J;b6(5UZ2uBH6u9#t5YTAfegLaWy+^86oGaJ1i*W6ky9&)E|*g>CljUGSrC z^=A?LMB;Wm3wLV9I|;%Rjy9+JN(2eY!luklhE6<42+VrI9p372u?LJMO#PjFR0RNBdq$cy7cI$YnlK|YfV&1@Fcwgl}MBI>h_K| zhVL{{N-L&|)Iw%=13^UC^Bj^_Zug3Q3QR{IP!_m}FmqpB;LEbSC;M%JjC0gB3{H zAK%Tw?(k=F$r1mma;S<<&C@s)@*?Ag&0^yif;#wIG{wUTp#?09$Ia;z#$-v=zUCUh z>aDTG@uZtbl3B<*gscEFjqNvEtfV6VtEa`X&vRUxC?`AI3g52Fsi}_?193|oK3{%& zI2mC#nto^dr~4l6&|ok)Z<>oZE+Uu{H(|&45y~2b8qMZiJuyQ>lT|1c=i$gb!(qKu zvB}X+&zd+~N`mMDf6>Fa@E7D?$FM8G?gl0Zi8rGAw*~WW(W)n2-qT3eEFRwz;?cU-4#-jQ_1IXkiIL;U z#<15fyIfd7I|ZF$bTy}6&piLpj?qM5vs^S{CVJ2vDw{(gq10vVX8Y+%R!b;zu}zlU zeHbzR9#Mz)v2T#Vn*Vf@$W45uHR4UB36KfG=lix;&1f~s<4#*IY|4l;dbsr4<)&^y zsFy&%Nnwq^nUdeLXCg*TAQQ5gFOxBfgf0{KHDYfu?-ZJqn6E#|%){GLhM0?jv zOxJ-RG7!s?2#hCa_H?jX&^mLo^T>s^YJY9%eOOBCbHY%#*6NRppyqCsr66^QS$Da_ zys}vGYktbNvIW1`;n~(}OS1OFaUi3fi-eKr3f1kFbD!aLtD zl`&NuF1GbUg>U;04uNpK-!5=3q$%a((3BFx=Yf+Np2S2h9Z;bg)u71{>x2+Evt6oT%0##M?B*({4Y_}mI!+a85{il- zw{Nt~`Z5jQFsh;C$z0f1xq|UPmtK_q-j)E(a$c%*!`E4Md@!eh_Zz8zPU+kT;Y^F| zYMEoQ>*-w2pS*Dgm}=!YYe>zhw~yba(y+G6^M@7NP}DPcpkS`k)}bjezn_4^EEAFZ z9=$5q{7t*B7xBf`9p|>;<_Q|oMJcrkEB2lZ^?G4Q9bXa4T~}hm#OL+Q6 zCUVUY3ovgxnRa;U`teyG6h3WqRxVS2ic7(CSW~|N&Xmw#pU2ncbP_%8onr0V2)it$ zSoTyh{&r0E?Cbl0PHsFYa=e&77I!=Pt%t8KXnVhq56Vnxg`CXRK}yEtCn2O4#GO`W zl@ogmF`x=H)3Nm!kBITbal#M(q7Fx4U_@0fK)fRZ z+uc?5Xvqo8C27qCyGqn~<-B!S7|L0FV8N*4ef$UtTd4o>q;ciQ8Fshy=R{kTmd#^+ 
zv5-2D`BkvSP`vt%3okthVl;L;8@SX^x&dU=cvRF6gf-UcoP$ZwByipnK2NLy(VyZk z6IqG?;rQ5)x7~1~nm<40?+twbhQ6(gRmb@ruMyz4ToySsa#rekI^?7P@J?GUxbGk6 zGm6SRc+qVSLyvU6USXFN7Ta-F?)PqOfUAXy@GERlbJS;tzI0qFUYMoiv?iA;DW0vw zjX#C<0b1>uWvd96i(>jvm2G~!lK?GDZNhwV?5w`-p;D8|~m4#pzqUA$LP@2s7!A=53P z)j>=|^_wQ=i9zfL$kLq>97DW(E8!6azP$Hlf6F&H>&l1~D$5@lsnT znd+x}PKR^fw{vEPJfB%A5LnNf?*4EACPaOizQ*c#`I{(GVT8jz7ZyA|Hm8dD5(6_K zZgW-*AA8dm>TQc9W+ydhk#zj7H#?A<;`j^5L5?f_bdb~iU#lLseUIdLanECi_NCjkoBY+6OX2u_{${0K1-VzRZS>PwG|^ zxH`|L=w-~QmML?(uNGHJjgE;23)RRM#AhAt3~3RK(x@IjEXS;R^xgbCgEY*Y=jhZt zx@9bD$Gz;@d0-O74*hmY)ZS|4m_MvJ${RNa@Pyr>5rKq!4<>6esKt$G@h=a6x1sIc z(YI1mILxo0)Fx}h_ap`_LKccjHU5fAnnUWb1nXON!1(g-UQK7v3Yi5;uc;Sw2SHA3 z7=niH9B3zTRD1fZuNnlc7^yam@kqp&7aZ6XtHkijwK}e$tau?=)zf3w!_lOusu8l^ z!ECO!mJj09<}@o+)lzc2PHuto^pl&F8aP5-l76OS!nKd0C@(Qju^c|plXEQ5Nl)Q? ze4h5>|GlNDh;f!SBiA`-UF>YZUF##zvs4NmC+WcR-M9-l?X{UhSj}?VcRYVKB#v>l zz;SG&+fu)-&@dIL_AmJzHHYk4L9!yqQ~dMxkL;Bf`;mHa-Q=i4gdm{5EWlP&1+IIgbs(8)9%F_PQ-PM6wp6&JAJwELr{WiDOY9W*{ zllcqRsfn|6yZtvaHDsUE>%PxeBfF22>Fm$?{pen8@~W)e1Dt@nRd0n)7SYyJ-FKq|XntLt8&54d=qN`Y zoL$=b@tAv!9HYDFeBpc7np|0!f09b_=moYT8Eof!+|Md5Xw0tA7$dJ2wxq=rjFNS9 ztKne-892-A7RjQI@kcl}f+~K@RaR9?4xcWN$NhoOjT2DD)y0UwP^jSqNmE!K#)G$gafTS8kZW>|{CJ@_h8Vx|z&2 zoxJz~uJ~v-%ggBrS9wI@KYJHs2GsJ}?erRXm~m~6#ADE^b9YEh(Ri~!pZb8NZ%nk+ zMptM&A%+*Lz^_;Iva679_%>5w(uRQ%=9!-JraY-y^@P`poPifj{M?nc_NUl*!mWC2 zfqp@jsK@uatI8jt9^8Cicr^@=5p!xEK3xh(>I^^hc)D}*IAY}`wnd_M7u&ATV|+ZZ zW#E+iVq;7CUFLIQ9IxT%X(kL0(&pB#YGD+?(%&l$#CQj3hLD1Mpg^-zJ6(sAt~iY2 zPCz{Z^D1czUf$J_`gvnR91cy=A46pskalswk=jibS3~)0^cI8kCjqkxg~1_-#hRuD zdwM&Kv0+=y(2&|ep2Yc61-W6Bd@`cBO66!InsEzy569lPU?8j0l?Okh`LuFrrZS}N z&nSAQvrfwE$>yy(>)R#7F`DYx)LYv%+kbq^b2pWt(MHYyIs7xu%>gjh48UhXiaU*R zwEa`3dyJQojgOWsb-YJ81)Fa4p16{a^BNE7JX(?oAm|~^fI0BDTOCx!v!L0gFxvLK zPAd~=L-@ki?338>{>Gi+OiQji7mh#-rb9Ifr86`ZT%U(5JaCn^Hg~oY!dT^3cV zgMd2KiP28ei)#D99~6qNu&>;ixtCGjgud$G>2o^Kv{uPw+b&1O%UAg}oHWwtcYJ;4 z%45%GvsYr5Qi={=ZPhQ=eoq$fem1`)+`ur5dXbX6Tx3Em36J8*Vk?XqDn?3 zqignP^~IxpcY=H*xRb 
zhh^%9N}-)S_|g_QTfF03{Nt$+5l+^sUHs>ja?KMX0zzOUaJqwq>+-0d^r5Z zYr!so%XgNVCoiMH{l%)ETb{c!|H#uPqd`eZ^PTtabFLj67dTo8dTK_^0JKNCHV^)s zALfWvt(-&O=(h6+V)FmG;k`!C9g0}T4sa}e*z&TVEgK1=49u8In=56JaoRqezTi{H zf!2f-^;?r*MT9B={wmw{H9n{E7zi>6=wbWO@MKx_ZuXGot@H?n&SU&+cC!HKUrtWw zmz5ho6tKo=y({Ht6_UKEFj&;EmJ4MtHF@I!W{dR~B<%6>#~wrmA+0jT1I)K~<+Yl) zj(4oL^%{Iht4N&XKe-<*&Gz3|L}y07T8~^oa)AV~@Rm0ulcGK{zf3<{(=R4LP7XD0 zX2;J=39*&(W*8Pct`7^$r8a0XCY772R7iH99hRDq|0>9r0*f3`?3 zmLH$&YyD#{DRC0%?{el9b7II(41_GheB*eACl;oBs1+dYVcw8xXpy_Isw`|r>>}uK zdb@@5q_{_`3VUIOr$jtK64R*uVVT{^U=7{W|0sH#9BI2uNU4uF1o_^(DjZ0FI)($I ztutCYbNolI!jx6}t=md4_HHj53xOGuBK?;pUAz2bvhB3Nw^G!g=r*Sw-ZpF-mWlv%`0Lzm>at6|5qE|W zfxW3OU7d|eKr|G+O3sC>h1hg1o_2!blYXyDV0|1CXU-GER*?4^e!_MOPLlyoRcAC@ zliR)tx8n`uSC_HWfPf6P4v(*YuSsxS{oYE4uizoTjW?>>!a zd;fPWnRkRcf@E76m#fxhhlfIAN(g8^r`JWh&U$U(RGuzHV$)|`UGw~T1JJqb`RZC2 z)5>rBQ%>6z7|eHc-;AK|)9j$x?;Bl4fJxZ?;(T!Mvno9%zuc>}0)Iu2Q=q&z(-t0D zv0sgpdSP(kp)i|K_FUIzAdD>QkEmR#w>24{;e)r5fbBPhi-AN8rvFK-osnE+uzrj7 z*JHy$aYB-ZdxcvT9DZO4NxcjXZUIGz``j+6ymgF?)^N??{pxxAvO*>COjUc%*)2V3 z`8b4lrn&Ez@%-yC(M0AA}AJz~&xtGv%off`A zXcysLf6PBaN+)xvI3`1wg2nIjeb%>=1UcGt(N>Xw9_N?KEReifX~q&o8c0CO{sz2k;=RO%4PF{GxL9_bzAw9@& z<&I)<{7&8Ro0IGzLC~3CG_N1?v6>?l9c*6QnO44g;Ni~8u$MHfXol(Q5*$CDf$!sP z58np!w9}>3ug^0b(#x&Min@Lc-fx!SQkvb+Ch+=LuT4eVP=1|&AD8pV##a>M)d)TY zlGWJWmUQw-4CS(hl99kueud%D`nWK9&z8pOy(ag&!IFCuRhbXV9EbB|RVwoMXa>3} zOI25YFdV}7!G(G%hC%ya&bMAr4BV$u8;4 zSE|l+{<>SK46VD|Nl`h7dQa4@D}w}gpA%5bB!nUvtU5h!eVeNw!$>D>Hx(`4&#c#s z7javwpL_{zOZHJegscqVSZ#0Lk?EW5;10|l8t`!e)WE!OjQ_?9-)#RQO1hxkB>Do> zb=fRS?7ZQSOAlGVBjmN5Uxe0ps#o^KhkyIxH_@q(^O7 z+`RaN){e%Fj#BACzX6k{Rx!k3d<;v`Xake9KKE^0lF;<9)s(Mq0A!PsRYo<{4faY5h<>Xe<3G&iR=FWHsoc_z8l&O>7rTYhH!-u8 zYV4ix^VBFbN~M+7K_8&pn$@C&WsBIXq@=8FI2X~ycACS!aVk6gxZ6Ec?DQ=H4>6X z!=@$yeS1~3Nj1ae|Ce&1>S z$^?=6@L0)f$KJd8enj_e(7MRY>RGl+G2| zLJxY#(fDe5?VG4HAN^_NBxhZE7Vsg}L1cK%^1>GzIAgc5_Sb7}*9=FN`E>s9f6BBE z_XaAKue*1tKa7j)5&&4(Uk;72fWwX9ADhD~zUaN;yPdqz%1g3=lz$*}kG%K93hE7l 
zxsod{a)c3Kfh4Xy#CuYy)QDp3fc8gX${tFHq`95dTjb-`>_>#y&a8IMOlZ96k=A8 zz{>Zc>U*;8$}Pe#D`or5czSn3sOgTrc)|j{LeQK3lEV@3##(h|6C?ZZv@74U0f-$~dAI@q4ZX$zk2>qreS3wStO#bVf@ zK~c89&@Z<>cR+_bVC9x=Ts_(iw5NUVv|Xm%S!u01=`^~2YuN(FxNCcL7i+PLCICh0xtN~tlG2gnX%}&e1cV@D$zwNh=O(cy^DnFd7B|8 z>)_1l9zg8xfkmrZpj9XQ^KT_fFvD8l;dcEr{O69k^@Z8hdE%Tb$oS~So8G9?_7GKl z<1?m`_0O%tGn?)xcXN7G)FOa|_Mo+8Cnt%Tl2Eq$2cdUlf-Ohh=S$>!(!VrXY zz20sFo8Mri+Lm*|eNzhiG%M00*-JqEO=B3)NUy{#zAND(paogij8YY&Q=RFlf~bkK z1&|H&N4S|7Efq9x6T6hR=_RBkyHOvyxCTtkiB6ULO3e!XTjM?2vK$H!65#Ub3Xl6( zD6@BJ@uO3hHoF7eq)OC>^te|MQpI$e`FW34jrvqFsR3lCGP#w-^?8SOMkJ5fmu_p* z2+-qWRtr>iz(B-TH!>8SWo5BkOn-`C>e}{^vI=DU(YLv{>Hxaiwo+0+twQF+`PPbDI zKMz;Iien&3S0KqtGfYe|sIThtSDRe`vdE#YHP;?G&%7p+%q4g;q%bh7`s0q*Q>_u< z0zTrO2a=3I!S3A}68NEMaQ5BR@4dlLTkhSn6W-GAK0OH<2Ru+04egLI`aA~?dTAJo z7Kn7JeihwZH9DPxQq85H*79*9OQO}W&gU@PSC#lr)17~;$4C1^sjvSy*qz=%NwU!X z;>!KCzWLdUmuTWnJVDbt50m4NfJYA*7pQVIwE?ukQ#QQJZcZ#=2gXcD0WxpLGnS>` zLq-g!FjOAVKGLCrD*VciVR~~>p>q^hM~|&X7F4u+Zz6KPa#F7XF4~zwIy8MwZ%fx6 z@kIq>rHEQ~DC$m`Ky>-Bclv$IJj8^F89Zd`#R2lqf;G9&NtUxu*ZT>H%=0Qz{94H2 znp7Asp$$L+l^74{Z}@}lU!%JmFr>YY2>*RAA^3;MUp~d}LL;h3A9aUoe9?^3{hy)J zDgLp$!{K*1{nZHiUf3N*$4!u4l1~&6c{5w$>9VohA`-fCgQT6}K(0*i)6WYjSqcKr zxy%-htM#2!>%L`eDWRv`e;0!F0&BGrV3y{#*YNajt^|iZ`?L4Iksc{!6DaxU`>rF? 
zot$T6AXUTuW#YA`D>N}7(ZKByo&wVOdc#`_7>*8G_;VDK;>M`H_28os3MgzqX*d(TpXa^b@0`!S zP=BzmYpuEF9CM5@7fta7#qQ<%sG22f%NX6>(l>^-4NDi{r_>6*MQD`F-`)G$b7B3N z-*Fd)XrdOzUO1G|NL1&VI;@VbJTkOG7gzV7`|a5hZkXOE(C^Oh^>UexEq-|VEz=Nx zpl<~I(Kle)gYEQciu#>P|IX<2(~=RVgCMnnf9KTxvMun9LDZ+bOUTxdlW*#^H7MF_g(A3py^d=g_Bv_Es9SLF(w=w9F&4DIIdyW zy=eFFq!mLD(9kr^d00H$bO|2m?=QI|xZA7jYb zZ^dV)(>ikfkEuZcCa5wUvz2~*woZJiLNA!`ivCKmX25YKn^wUvc})xpQC}F1*`=!q ziBS&S<680qW${9O;cQ{|GF>DxCF{NR`9I5tPCNgc#O&;x44drbeiIV580=x!P~v@p zr1Md`saLw#yWDr*&c5&+bGTug7tKAkYm+rpdUeNdH#Rox7HXLaF8*1bKLa<6uIw{UYe4`NGqzcm=^Rv%}n&k$jA zZktbgtK^fo{Z5MPh<;65tl69O(Brbd(K-t$3WeO*T(RBxIh7z_>A6~RsCLCi;-yiz zT(GHuW&`W%2&=IY%Z+a?90%lYWc{~^rLkmW=rnTsx5i4%YHMp}su0Kcr;p_i0SzN} z!hidnV)RsvTT>_%Sv-NwalLolOyyRYX_uW@_hbT3+Q^5yO}8HG>G5J}JzCwi?|*DTJd54jef^W^iordb)?Z=dmp&i|t;4`u)tu z)F(f);^onIJl!jrgS}2Vd|aIuoeVf6HVe-Je@pin?wuzkdC?HJomwKzpRG zk8sSf{oQ(Pzv=h>OznynS8;F-Mg|=8tDUUtU!5f-C2d=YjErPeN)oc0Z|kx9|9Kw! zV8NFdbaH#7Mp~WcL4p2KH)n{;uggId^jtrq!`)6`IagmS!=OXFZLi@Rqwg|{t-IapXOV)5+@nS6~pe;Kbco^NTg(jE&7tD&L6 za^#a-7!46oiOozc7V&j4_g#WatEsB;2iI4}%U3sMo5HDuiO9%C>by=;?zl0wrieJM zz&kAUq}q(?ArOe6yeD)Lo>p-S)$iZGf7bpk@wWBKbj?9F9S*@d{^qE2hO=w=)ZjS*5#t!vzR?yX@oem``p9CG39 zAv4%JImy#Kx%p(O%CT>xHI!=Sd+RyW!TG&J9u2Xb&_R@TNWaeY@RB?|W686RU zd54o@4_PYVz4 z8OT=dh~*Cdu8r*TdpqI$y?>#`pHMRx^IfBxYlYv->;;E?_P@YpHR!p?3B*vUlsWL?%XG9prif!)19hp1B+ zR_lkTOnkQ9+gz|5{wS5_ zKHGTl!!iUJHSaUUkx#mXI+Z>|Icj+qaY)5H_RAE)Y5ly>%d4!)DM**C#fcF=h^2gZ z@!{cN#XkKS*K&x{YqO=)ILy<;tiLtOo@afpTou3YJ%-cU_i~T%SgR{0OM)Saho8Uf zUNkLafIojEXkmhQOJF;ay8KmrIj(lIIvdH`84-mu8+`yxw6!N3msKoRhbdfKHoa=( z0%gnGcE+T97i&B{##>r^(vNE>urC~%^%z+WuN|l(x3;#J7#Z`PaCtrE65!HqK7)fH_Y~4Ud?>P+*>l-)VrJRHy1=`)7hN~U z$;x`rN>}LmYyS-@k^YJg{S|a;#&Ce3)ZrQTSj0%ah6LA7V@I>f3!0cuL-ju(&Y8m% zR^B3rFGvwwkUB)bSF%GGU9@g?SWIqiU*j9%{(;haxb+KmyY3XU;TkF`Y}XcJ5u6Z!7W`4?2SH4!F zXjQU4T_eDMxYtTxL#tf$EO4qN)RM)$rd*{7Piv#25+tZ!;59-FFR)j2h=KdDUyd(xnL+1pj=(m39Wq^?*E?JT#}M zpqv`1#AK$6ysHmVyeL0kNk;g$qfwuDGm*Uo5m64_b|>CYSy#s?y0U$Iv-#2EBW2K^sKDwDdCJ~ 
z&+ra^w$@HlAtxkEd;gw(s)`=-iJp1-Ph1kblR-Zuc;a8yOm(H!{qE}mVUckTO&+-q zU2<$1$i0^@F~3%RZS6>m7EE}_R(yG)5E7S?ob0ynHNs0=PtPG|FGC?55LHA{(#M>f zfbfWj2&g5k3hwzFddSfhm~;9$r4UB#o$9r}ZNBlX0m`H0G;$A)qoDQpg@)n&exDdO z7nfu)m;S-Q!JZ!ce$Q|9XK`?FuGetu*M?9F^Fpxm^Q-CU=~X#Op>?Gckyz=M-lgiR zv>mKwkJM!?=eYE2!ucX-{=4ntXwJl_C(bFJYA`}^G~cSHpQegWEn;9s4r+zO)+l@F&a$j$wZMnm4_f__?1T&#(&50+oO|1+vi-OPJug@08cRV2ww#Rf->R=X0ug!;3 zS27C-()eLd%AMl&-djtprkE*6TU2aN6h478~9=J(l-TM97fqlkGHy$LrN8;u3$n|Cuhq`NNT&*WDoVb+c zR(YA4Q^Je^>+-JXu7ls4nX1{C+1d5$4|d}_7KZY(YY@dXB_)qXUDma(>q z87?dw`110|A77uG;(KMajoPC}$JuvdSf=mZy}P?UH)m{O(o7h*GhS|4sq?e(4bG3X zH8C+UBog`H!2^JCrk!!O?B*4Xjg5D2``p?o)35U^NK5k`#l^+NqY*(}zl+xWahjB` zqR>Y310O#6NQXIku&jGt%@-AgfRf!CWW*?~|0Kywr;m(b!7VYj$YWWEkGYbL{Wcwu zot0Ht(9VHGcz+oE5bx&YHF(aO3}^LwL4Wp0y+S|F-6PYzI`KPl6OBo+=DlR19in| zzAfTWhWy2g7r(Er&eVCqS^7>$NSKh208l{n$&+S1t9RK-$v!b&yBpruHS+&PnKYd1 z3y)TFO@X^ffn2y@Q2hm<4oC!~eXOf)L87`_50@nOYATIfKwr zKD?j-z-f2feoNNHa_i%Zk(>>?z^T(?5aqLFDzrCzE(zPOKWR( zbtDUegoM7lynLe~^I3F_?X0HRQZ!M}PMQpBmXfZ%COOCc4Ot>gjN7r;C<+w=^*X@r zPfE^l{7)G!$j^_Bi0CiNeWaIC)PG}h56P|n82XICs`6XmtvB9E;<|{vfSzl0C{d+h(uo5YZQ|Ni~!*RMm#I6m~pB!AzjMy8AP z;mW~;jhj$X|KZ!f?2S4vmUP6z*Glj6OV>zi39broA1n$hZ^!E2vnolI@c+KNy>|^#Zo?iFF0bq!wH#mtdVib3OB5*GF6cxoa2RomFh(}z^(ij6k{bPC&MPME-#allYg;aQo4V?<7W3P)B>BSsy8Q3pFR~45lI$x zriJFVw${@zcCwB#P34PmE8bwoJC^%-$5HghSO2^Q!%+Lsvs(8(5$o}+%=@fmHZxVD zxbtwel+F+CRlUzvg0b3P44ZWNv>Sp|9PG-qu7Wyi*%_!O(4?6|9yBsSL!)f){T;J^?RGF3_v6Qp zA$_PldD4Lw!o62%zqG%<&r1hg6CWR6F+CHQ1~2gG-@r()NJ!8<@q1?j=RtIyft%ZA zw8*d%=jnCj6rCxTC|)S-&5nI#=Bm27Q!Cvab!7j+0oY+-`CcbS$insY8aHiQA>qsq zPhVoxzu3-xZHoxLbEo3)&|c76Qe~}n$2r*9&TbPLkG3Z2 zGCplwVj>^=8|m4&TgF^EUu?IR`=BUlIMZfg;aTr)%)*h6IW)U2O>|n|R^I+xMd8tG zCW(7QhWCiyzi%Jdto`%&NWtTv+s-NF*L-{G{g{xG@7Y<+Et&sCT6M?!>{XPV^Yc)M zPB{d@bj1$`JL?iIoAW;=DojjF*forRHd_v6zJ6UxQ2dV9Bn=+YR!01M{21;b+!8?Fb+x>JXZZwiQJ zFr@SI&&tgChQH>`O;#FLJKBG7bFM3XIoFR8<>4uU1kbYqN%b3*J9LIsRaM5VVMBw1 z^N0572{WDtKY%z{ezvi92%GFJS$`aqnvkINOiWK*JqC#s^2etG-Z-R1Sx(Lb 
zU=|bv>6`s|I7I8%rKUGPZCc0G~mior1=# z><=F%Ydu88#c4wkfHALLy&BIWv;xgC13rNov#wJ+-_BbomEk7T8I_flett0A_h@f- z_q`$~2Zxc7(YbTyYCR6J3JYz_&D-%rBxxI)noOQO6LDDX{iIuszI)ngLx7u`o41+8 za_NS^0qn%;$HvB%E`kb%emgcmFDMwB%WGz~ghHWM?#H~p@4HFO|D52fArxx@n{8Vw zYwL)sQQELRHLkxt78LZ!F9uZ$ca4*B^J_9+a=yk!q7?~hh+jHfWalZ zG#A$v*4Ix`PY(UDuGb##Z7s#e#|ya-V0BwN5vPmTFX9rxJ|{sO@E&e1+*jid!ZW); zyx0;-#a0ISHe<3SgaVM!8zVX~1P>k(tstxTMvHlxVv1WXrvp=Gg5xVsnP_oC-8t+E zx;o+~jZGSlhU!57fPLaKbXP}vdkT-?Lt9%-$@*DA8>7~cr;AHFZjskR?-AQL#yiP5Jw`jZgunG?TC$vyc#Z@+$@zKgq^EKFRY%<^$Ku z1R?oD4nZO?jQfnw6<J1@uTtGsJcTdpS0HH* zO1>+7BmL;IQdsMg$ToWP$Ts7y=gLi=af>#r<$8HQ5?-J zEsVOOoBf&h$7Zq$_3MCHBd4M1a+h5VjBx;%-zHXii8I&N_q-b2W(bXh$3E(#AP>(M zhwnsasl}HeJZ7dl`d&wlP)tIqm{NOruJu6-Eq2DAzhnGc?)+80*QQLSHa0ekHyn68 z4-tZbf|@S__jWcm;skA`Arp@ZK#(X}HonH{Y=tAoOH6R=np-Xfr-(Xpi4pU5e)M+BD4ktdx%sKE=qm^dO?+btxYw>-%X$#+ zllMBo$gw{_W8>RdhgiaOIMEqgkkDF{h+jRHehm%(jh#uEL!mPJrS4l> z1!y)EZnnFFjlj+KwHtQdv9`7jTXx2FbaZsm-%hrD+|cr9x08}w)Go}zC@-8T0mq>K z!`&Qi>TAT&DJgoMp5iN%V=Y)oiHRL*+v#$_w!FrbupMhj=QxC@qns-JWfWtzn5F}t zN~hwR+wmgT4t>c>2=yuP0jlIO-)VECqB2a!MDf3!*jB;^VyKZ zDc+oI@_#=5{sBQZw7M166QnI~)JxjtG&W{|f`wYvz{5Tr17I1K;?++26g7{*nIvp^ ze>H>SvuDqCl7ENP(B$9up%MLFQ`7IARHv;@OiTVq8{#X{M9RP;nf{UbxWhwcY$mOqky1 zKW{Iq`o)--l)T2D6B@jU@$vNf*%m-?O|@{8!vg{&S8Pq=nD_CQ8R&068$zL8-#v$o z{X@a@EsL(ME z0`BqNlv;$!Q4KuFBL9oN^oz&F6tFLNaKya5z1#RyZ)9aRat`; zMkcBGvI)V7r9~)$7={1Jp;?rhTjObEe#MTB$G$_XB(n1PYiXH`zE1#!W4W1PxTc`} zPs@7P0kk03bZ1Y+n#XwnVP(Am1U1AQ8PUxncf@IhY|ki#l{p1T8o$!JsO8F!#bfL6 zvKN4FS==GUA4*M5E_7@3qlgN8Gpk#vSg`caE}9V0{96&MT%Yyf4G8$h+WiJ*s2N7^j} z<>#}z2kN$mD4JPE654Ros{uXQ{raw=qCrdi-1{)vJ71?IQiWbJigRgy4xx~02@w2x zA*L;dQh|RGH)Rt)Gj4E;A5kG4FzDv3r5^u<67~pX*$(;c zfQs~V#>4oUsi2$7gSo^C%$ByN0rked|GC3GF5-=?O9XOZ?FLxrQ}AjVy4A1yX!n9j)s zKCYOlDh<=7z0{p-xMng!)_wVc?|mlbgJq$|f_!`jyBI)M_8^}%nUs3#g__h!mHvsmyw(qxWip{cuv>sd{bnifTiE1D-UVehXA0! 
zmdck3AT|uFPSwECVy0_hzpBuvmX`E(U;C1#4;0^w-^y>d#NSYgOTF~XoaYm;;J|;> z(?aswbr9oCq5TyAIFo zQG>d%+4&PbIl$+kq9Qbic*fswmzlp*Rfz}-{}?OvKhl(4JDW#jgnvuk`5fJBe9|hT7l;l7C~!);OkNPFu&*_bXNLclJdN-|Z! z{f_FI)e6ZqoB6A5X|xe^w>zQ6+P*O6c4*oD_3I_&+~I?}ccqr34s(M-sRX(7Ym;hP z!y+%UV%@y9bh~?Fy)y2K`6*(4;t!-7(_<5NeY5BnQ%UPC$>iq`?|C=1bKH3L2=n{b zuO9>1_>}jpt$zRjcev)@=-87a?5j@7X=z|Uzs1}6m&SQd zPK_(cOHwS*N086^7*WF+3i!)+##I6sOR+3KO2jMl4N{PkqaY{eL5K1{K(*YvPEJkq z3E;j$`YoQTJ0I>wPEJk=osg_M{TNT99`!hTNI}WrxkUHvOd4Ei%J$;riEL!3v}EpW zi@_@o-65k+RM>F8h)y~Q+Cc>pH~W8(7cttKQ{CNoX3fA< z+tTu)Lz|?Q(!fMxg&U%pU- zbth3M-48n%q(92{nckk6uKYrYK z-b>FP3s@Un$f%HbWnHc#I7?MS10bJalV4hciH&S2`fAT>xzYo@I?oyqG30(CWU?RO zp#_U6FkG8LJCJ|A^=AeODmZ9(0WM;@aD`{7@I{CPbU^B#blQ~)y8?DzV-cfR?Y$99 z`5}nl8I0*qT0V3rk$gV`149nSGflBa{0VHzQB8@0HY&WDe%__Z(WUkA6XllKg@q?8 zL+lS9c9k?&x2>Xr0t7#Lg2e(I!SoC8{30t6K0ZD*jc6+Md37UI78KRxa+?|PYeMc1 z)h{TKm~Nv0QfOl2#n>)mu%;<6rk5)s5hDbD=!RWAOo~|LOn0&y^R;!$k?yPHJk>=T zEdCkS(}4N*Gq==^#(=^pMVG!SS)O+RU1PU^8p;>E$2Og%(;jRAD2(oAig$6ZRc7dc-qHY|T7uo)Fi=&rDtY0Lnk z`H@yPsL}v#dIIR|`QWTj2R&(N7^cRULU&3NM3@<5C6ia|SH5sB*jY&l@16XCrVf%K zV|ZXdX3VZq$27q6CC1MnADjuqkd#jk>ooYRoZZ1P=Ecl=J>l{ijR zE&^c4ewf%o*p>JM96teq3C%!eH5o$-ey?K)T5PAElYwWhs2jb?$zcS7 z{@Oh~E2t;8I|S#IyWg8PX^chgTZ>%~)#Ib10hdn`aDVela2v@%C^;`PWO|EI7-0Mn7 z2s8={L$Mo>cgo1-g{>3W(9r1Ov1ZHo#|R6vNHm6KUG1X!^o{qE5s1$iV`)Xj51A$k z6cl<=Qn?J3I#~ttL7p|73`%P$5fKyw8=D}ZnwXGWy}CDCkhrk8_(X)r`xPA()y}@- zgH-Y4N1xxhCpXTjst%m&&XN3gwSl0=qZRuC8Qsjx3^T=`)XU2DGMJQeW1$D>hy8T~ z5_!MfMQh)=hMz$*geC@hbDasoA=elwiHYYH76hKP5!}|4mA#OF#lph!;Zt_Dh75}8 zV%YCs7cY`(BHobAzQMs~)6Oh`h$d4|>rmb0EhPpGK0wBm_Jqplq!p^DU#UazTqDE| zD$rpQxwp15ln<<;nMEqoRg8Qt7-Y&ph?mp*aooXOw>DE(x4Vi`;aGE6u(}R6a49R( zLDzE=rodu={f=d0fg-kEHFwo^4N`~vQ)q<4Wz>j4rF`V*zOJ%$H*){@Hx*@E)MIoc zp^M#FwWr$}j6oWODF=H?!f)Pi%gS`!udeQRVAYLcVxo-DQ{hk=gIIHL*gqskcgt{xjd*3u;Bx+!^AezQYP_5vjpJ}`Dy(sk6u0`ZmGo17i0MK0PjYSAT#k}kXlyj>m`S8gRkJyQhR4-NLbiyUS9O!4#@!YOupmo3H}5l09vN7*bb=KcLC0tnwroK&vphp19rCa9`o$-a==A(U&1qHY7ZXZ 
zS|w44H0-Uvy21ekpGj_(i58PPu5v{)P!;2rin)|SmZT(?6~&qt2c5dP%N}>#vyRO# zm3wb~l+8z>vh!oKl)7vv(Uhwpk0(CWCL)mhy z|I6e^;E=_0yYjz0Ez@D{j2T$mpDe=_}p4$0T_1ZrNI&xK&>tgZdzc! zfm`ae!;n2?a1`2rG8{Ju94cVEkO!3H1yfuVkAoD&Me=JZ~vV_)SB_@FH*TOnqS%*C@ zJI7D{=1g6)LevkCF-8qW$vn!xH_XmQy)y;>1D{}a89?k zw=W;PXZeOf4C(%sWB6+qcyzw_3;7=Zjr{j-jdk_gtM!{sIbP(5=$(;ipg=jaJ906vn>7+je>uR6DXp0b2|W3T2ug%6P_5hY1?*wa9#I3YAF1g zbSSJ^)JMoTYke85=;q+&uE?|U*nvK}*&hz?9A3>szL*mEZ3wP04aIACX9G*Lu1viN zZ$_c6wPS5@y$HYnO^WGjZEYPNukt+F2Xz#E&m3spF5e`XzOm>KFUDU`Sn4kY(~(WO zfzwklLU#b{N`Tu+C0KaYC;cp3U0q8I(Q|X!WzW4ci0OV^wGCm{`1HC~+IKAaG9L9P zI4BSZAu5TmiUMy(FIMJI_3%J(76u5SK!# zYe&#Cp$mQ338TbNRJVAdCyET43bZS3-nfxQib@^sZfmPoSj>D;KR9^J-)Z4%H1wju zCp>2tT8f{Uni@S-R_uuP%Q#GC=eqDkG|dg4}+_S?6@u=A#w9&-uw4w##Ob>zvfw= zsq=qbuScG^zl$*j&GgeJuC1_6MrIZkDdZdJG)ej#)$FjUDpw3oz}YGmCMIJ81C`Hd z434r_CS>pXZ&+AZJlE{F5?fEqcH^R>NdU_Sn_lWHt>=jlj0(xiyID;XDt<_f1#7M>#_&>Q5xkPn zP-{w739(`9cK3g=2VH4qKMoVTc4HGpAu0I@@I$Hsf`etgG1lP7;!Ql84)sBx!j-7! zC2rj!>_acxA~P?_^f5?_r-t8@QCy^!R6^lXv)*-Z;8*$pykE4_H#y0EA@Bc_WbxNSZ4L+z-wZBfv}s#S(4!)A+i`ozE1Y#4Z>_< z@u1X|0PIZvbmH(Jrv3Qyt-<{=3qd`r@@iNA6ie7<%n*0Rrx%{Hjq2?`5gp9T?ze9h zzZv+wHdg!6*w^>+omlQvUgMVK{-21;moB9{DPByz zmpWg@Wor(unG4`#9WOR+1p~!mcQOvc&i1zZ$#D%((xs)P1qB5_N%K?ni-TDRL~*5f z-c0Bnfm41)=VD}#qS~nbTZ$uB@;`Q`Kg)Q<+1c5_?Zb2=g9Xjsjn+VJZQ4agSG&5M z3}-7lw_n#FwwhXTS`6v=Y;7MMz0ex2FYrvUCzQ&J#4SVfDLOeS>h+1{K&hFs6)!)( z4w&v?Q6eLWx2PlP>gpuvg+P%>5)&15J9R^SwszZ`ZwEcR_Znnb9@X&W@853~qMuc| zY+WBbf{qC2Q#VjkK0G1&->V03qX||T(&(uQveJ$7DEY6hKV^n|(})!JFnuw-2HJus z;Dwf^eNJtqq&urj>>U+FJWnh8rJg>`Eo&cw?h5viot+)1I)M3L7Y2@92z+Mu8Nn00 zibg~HhCplzn)jm`8qz>J1vU^^v0CplQzuLXXoOH|!Ey+RWNUM? 
z=&`>333#9Yxg~*$35px60>mC1uTrq>fH;jdmX(!-mN|?@^fvg9K8_3woH_;ppB3w= zs+NMboR^n3Jw5%vm6nzk;v3|7-=T@oQE)9w9sZ&vlK%I;gM;U?nXKV;rbDMjA+na;d`hS&@+g0I^Or+66nPT{X~Ef%hI_a{9|lpta{< zOHY+}hh-IT@Cc>H8g?cM;{ViDQ854~d404S=hHw*Bm{7UVUreC^TB9w2J!WeTpWk0 zMqt4?+85D0{Hc%e)7#&h_;^d$lhc!u=;^<`Nu0=>@zj2w!xo#W z@4`2~Wg_nRK&NIrn#AUATUS@X7rCXSrQo_!wVQ1n>~PBZr#%f!O&5YS^z?oY7qn(8 zP1(MD3dT@u?3*W!HwiYFm?W&m9Zt`8fZcaA35NL=m9CKG`31|PyMN*UoC2pkMl51= zbtE`EJpU(0>?<`?_4p%I)%y>lpRqNZV4qq`zh(9uh3!oYf)9H4EPSC zlnV%L>hQnU^!VEwa6D~n{oEz`{U7kP$AwW>9iZmRYCS&cCLzb+{AT-2Zj2?AC|1IL z5gm7{LsfM>Hzy}Xd@zaCwM;a-v!Nf1HXEdzeqj3HWMUHas_ly5WEW0yz)F8)>v%`N zW8d%aP?A=;-wb@AzM@WLtbbM&TlkHa1f^QrtX<_$G7Z>H)7?SHU^T@$63=Sy*eH2@ z@d#IKXdzSVXB2(jt96eSw>8aLuvb+T{l||RKo<^8#0AWG`X5ibyeA=!s+^sdM+!TBPT|PmOOuhTZ-U_$Z8LN; zw%iiG#$(moYVvIpVaz-AzNcLsETEP$G08l97}`4b#CZ(l2!X=?+q!zd-Su%5pq$UO7XJQSOGfIH_Hs@>vo2}P2IC24ROpj|IY8}MI^Eg> zAIRbQNMY)`caI}zfVW51)PTZu@tpT-xyutvc9oBj14@4n^Pi?4FE>b#Z1W2O zW5cSfSM6Y4ow>lRQTVg0Xqm&6tU`#yC0tN~ZF#835XZw~N5FdggU;zpK%RERPv{hk zjE#X`9EF|-Xn&xgS5{Za#tb3xf{O!8g)T2KErQC;srSX~7dyYaoa53%3K`U&F(3MW zdy7kc{+u4jvPK*qs?D|d>AX1b$Diu#6j8f}yl7}h=y4#jzpJK*Y@V63HZ+WgiMh_r z-9uQp4H&kF=#seGZz~8U@cz);ape~h5|WVc0wo*#&5%ez0RZxS9UiL6dHscr%?VIU z8ijfSR-*(?S5s3ZylSD>LFDzGRt#hPix=O3)BoOoGwrD@_x1eW?0*Nd2#l@6vH=wl z(NK-s0dyVN$})5k;^VoI<_joM{%z}%W4<~KB&eLAIMa}kWoKqSQBnB@-vo3qAWM+T z1i&H*S^v|gPXy~gZk*0WuFj4I1qA{65V(DtR={Eq{7Li^kgvfaL5-xO6^HeM)W*b| z_)J}09WtEOXDf3U0t3E&RX=x-(xBX9TJ@t}H+x>-5 zzxk#&8OMm3SzCv52Sn+=shIdDY{^JN7M+`4TIx@iiwx?q;NqHw-SOkQeBpw__x_8C zlmZrIKrpzt>;dyFVBRkaF7e}!@854s*wl3;i;`s}CnZ@~TH;_yfUOTEDHlJ3-9u{VQI-g>i6daHDLlTB-ey zk9@~h`FjnQg5}q(x@S14xR(8&K1n++1<=|nRjPCDBVxp2 zeu0FOL+fE24-X%owvXa75BHQLF_$0p-V0*174Id}X9O6TBtGhU#r;7D|G0z)DKWKV zvH!Pk8cO=`;77|B)9$}kpti)bT>55L=`OA>?{|5oqk}l+&-jjR&A!2rYel|>b8ZgX z;!>q=#=jMWkT+Ny5-m8Cq@IO}G9z;0Rh}%r8W#&{k*eSM^?pUd(1<@s2={F!DsVJON zep$BQ)Ki&t0N8Z!L}m-VXGVxCWCt>P>T4q#Wd>CLQczCvu5AZQl%Pduwn6!5Jtvt>5&WunuhYWh*}afQv`x|Lqlza6VSu{Bhya`^k8~M@&aQM`~z?j5|Sawd%?h*QEWODBMb*KBWO18E|bGCNH 
zdG==7e5J#p&Y$*|c-HEg49$*&c3rU#6%soyKQ{(W$x(v04p#%qN3Hnit7^c^74YZi8$IX z2d5IOD-wAK?H@Qyg3gT2udYtyszMHbs%P}M$f5hK{(4)5zBzvzh9b~tjejPGZqqm|_kJa0x)gxY?#=FD)}0b* z4PW1f$)Qxt@132Bk`kZWz|DP|gIxt&GF@&t0yXKWwsv(@)lU%S03U!`(QSYGMhNB9 z4(uX_&X*0S3CpM%FoOVg0^a1c7HB81oK$qh!BE&DX;IZsszGc~I9%T0)~&ksiL1bZ z)cpn#HRSOgHS@0{MNL$jDPTMLF^{cv z!Os4Tw4vw1LQu!jOP3R#^+umYX+zAFN@|MukhWf4*!g`&6!_2|K1iSuM@rMHu$h|* zC|yJ)YOB1pv8m%_n9VO-_k2`lqg7r~X=~Wgn|5^Uf>V=&Xo_SQ1$!<`%t16mD8o+Z zLE!?uPy4f#3z#97mzTkm^byFH<{*;OQ4xOge&7jaM{TB&eIp|p&@s4f=cK0_fq5Xl zdKqj?c^ZY&;K7aGwzIb{#v%0sWDLgue(?U@9#|C=GqwKEjLy*!U~{mi#evvCE1SKK zKjlWw-8%MWvQ23^49(@zJe_ZLgN=I7Z8LcEI$#V+|&=^zQ*lj1Gd$&ZHxT!4E83&|a2- zB0e)ix9a?3T)?awrp~~P8Cn3T21aOY!LAsTpO@EI&oB+cE-a+<=p=A(yGiIs|1se@ z4}Du%aa|iN2LGwy7)-N~g@MmvYg7*8CBXc%O#pRusvK@8Y$vJ#lR+>FMhj6q3SJa! zsMu{Cb#SttjsyXYgHhY!IS_ZiUK@7hbe0)9cTm(AM?vxht}Fp=K~SWorY^dE;|7-K zG}vS4ic4U;0bB*>TXT7!ZksvAk~UB{@xdKkPErin{|`tw~g;7>Vzi7$1kiNFbfFWS?As?FXO+2%M%+Jd9Z|2FGNDTUgy)?T37A9r{{i1 zMjUDCd)ax(cDMOO8X1WVq`?4ZcXu(AsIc%;Rn;wUyzrdW^iE5gg{%#8z%XCpw7vW& zpaEXEcTVAqo15FRGxQ<`bJTsiLy!igIQjYcfs8Af`vbNikii5%5!otr`^6*z&I25r zsC(t_PF;fQHw|Dsz}A?9?dHwRy~Dx5I<7t7N5KuKLr;JNLK1dfI?Waq9@yNu0Iti+ zr!W{KEZ7QmN=-n(W8M!Vhk`nl(4936g+7Ec4+F7q>VE*a=SW80 zuooAR$d0P;NJC1>D;K{j3_YFy&3gxA0NudWWK$&kDz*oP-X468G;(8qc1lVIsoX=w zS8_k`o&w9zNZ`vfi;dP3PY!}RDY}PtpWY+vWwW|+m^402Hq(^kAP#nSdC)M$6&w!J zB6r|+jwE4wQ2(NnlYgwP;xsrpI>Jx@7_Ckq(ZYDHfuW(Hxp^+sSzwOgb^t}8s$;O( zdVxJ-yS}C6(zpvuv&D%qzV-y?5&kL_|8ww|osZiCe)hwYfO@7?e2a7 znP|iC%5R8h;S-OKl<29fhg5NVZmm@4(EHyZYrWk%_2SdC%7F?`XEhEJm15ME^c$y+ z+}E0cI9+37Pai(K+J_ctrrhP*7G-As0Ip&R3W_h%e;gBma36~dC?!1s&frzxngA+U zx)N-w!bb3WwSuYDo#P2lM-+ej89-Mf8)BRO{JBO)=kt-h1!i3ibLgT>mB=D29RAGY z$vqdmq&FT8rKrM-WB=(=px=afAVc!eSUr}2$(aT{iKa&VL@Q^O&JYON9iv6bcJw+2L^tJU^6me zBz^+cZ~B*CTD;H0m-LZCf}g*o-%f;mx}4Hm@U#f&QEN$e^H2e>`2Cl4mv# zLIhK}RaJ8RPblW+FF+;(qc2|IPF*Ol(5Gbn!-r6`ZPva*R|KBAUu$DqP~gG81tdyB zVxsXZ+{psuiV3_&^^<&cFeRrisR#jU05(B47!|FlIfBi$Et%Eg(ni8v5a(vPdwR@L 
zs0x4fGr|i6jAKL9iIQ;)bcS0q;xF@BQUbm|6$aguy1Y>bGk%~hR96G=5ZPiAkbyxZ zMkXf15jdpA&G6&9fYDzJFlAaFFK2&N&4>>5@b&SDyW0Uh9>a71LBiEB)8@n}9I}h3 z14!DZ{Q$uQ2HjI&PMvojDIl<#5HFxz{*{4~vlMkkhf6~}Cj12EQo&0^#BJi@GPXK$ z{afEkagyAuBtxjI7JvJ0i-?j%*_9 zitN3!bID%WWOKi+&iD7d&;5A(?(@faJWh3VU7yc;yk5`sl4iVgF{PEAoed*6^hm&N zCIN=SgWo?0e_6<@_PML8#^2C_8ryms$UD%f@Vxq7Z@x}R^*uekDv#+)CiLf_vSSO;CS2H+n%R;BXC{VJ+go6ygIxF>$JJ~7_cHT zW+f9N(TyG?NwXpvL?H6nx8TSLmaO>7B z%`K1{1|nt?r-AK&zBc#&eF40pvk$Cw!CK-Jv;BtuZR50zOx@wp9v~(WU%!$BML!$( zG8PH|KLgCL#ii|R%lQC*Ed8C!x#;%JVi6`L>^K7X_-sW!zCQ&~{omoaxQNuxG&?2o z6+~@PZ7|40C?=GBD&BlQ*NX`9$EdJ=5DIU%1-|{SA@eX;VRayn)6#+j1r1PlzGwKe z&IwQ=>`m(H*Fk*vFcI$iJvhN4%opsXk@VAD7CgZ7!9(CT@4HajtyCC@N9oE%jQxR2 zA8jY#j19}Hgmnp}J6Pp{s}vLYRE|K53Cbc(At%FT>c=*Qh8MR2;~BOC38+KsaFYJU zIsDah%&R;#6E8 z6y7Kt<=(t-(|mK=uLWNQnKu<>PcT0tP2t&m=T)bDNKJvTq4!#lD#UhtNi4$r3BRCV z6-0tTF?2)Blg29bCz@w_NkbO`3;db0&p+oVy4&qY`xMF#08O;1X#t?84!;FfvQw_# z5dNCM7<5M;IXKAk;m(+_fp@@r^=uLb%IvHc2ZvR$4vD7#C~wc<2+e@%;NTBPIY@8= z4s6WL+ZK1ietSm(Hw}qFCLg&9XzqprKn2Pk2xKB42!GuAwCn)NEX)w+p-li*;ptC{ z*{`~RZ5?{3-@|2PWpLnu--=wv>+H~MbQ=TE?H1O9LqDB?s#MZpb^4ewQ1 z8VNE>XYhstDsy;zYz2n@dGCvtNAsZZa)kZ^z{e(a{P2pB5}!>d0zANQ1PTX}hVPrO zR2<<#gTj!WvGmzwmJ}LdDMu%DummMQbOkpT*WUJvbM-u3=ihN~p}l=P37QtvtxqiD znF5|CU2kclL7t%%h!Au8qX75Sjop|S@&j*ZY9k_gpu_ovl5!e_;s>f52?+@{4%Fr= zgfGaPkVtnp#2^o90K|ZgW9tO|Ub8iS8iM(ivcYsP&{^Si_h)0BM@N?=QID*;6M~`q zUl9ZL{#Q~;{oQXq(^SbRVqr)X#ZXoMVX*a~3b8c|>VuRnX_YtcLv#g)fMT{3Ym(zOMr&Y7I(RUu+MLQ)tQrkg$L9^Y0|i0fy@9>k9{s zpCVl2He;o?Z^iyk2{(?%j;h22x+Paw*#ACv74ew}eOM#wZ;MoSHYLkDK2Nwb;9SJNc9)KaX9rLu`1*;^7fm%&T!lEgXprF>kUc`Hi6bH+268q2 z`c0p^#lv9_wZz`;E~?fe50F0qCo@Buo7;dwyAnXNdh{4|dR@cW34Nt)qsw?4=>NAD4zf|(!gNC+t0<1vTHmQLzD64rIvufR@vBZ-x7*kliEYZ%)et zFMfUZf!jh|O%3g9NJ?^9jslhgU7U~Oao*SU1BEJ_F_~!#V%PrOpyd(TOejm-6Y}7F zSyG$ym7Mq)g8m!aMslX6A8KpAzU`&Oqa5Yb7cF=q*?mYtqT15Zc;J54m-e8v6ivsL z+A-|b9CS8zl0uA$m?RS9gsC0qE{~z@E7)*fYDq}}MeiSP;{pZH-NnIM7?P!H8_d5$%3urU1v*Cp6O#|_09wO6AfXT1X15rN{&{QvZzMyEWH`AEo%gD% 
z+p&yHtK-^uq0LmGLQJ#odv=tU;K|{R=OrJIsp~;}f~QA;&*g$gMcbSSMkHH*c6R&_ zAjOhILc|iBJy5m+<&p?7YbGXTKrMsy39jB7g0D^5BI$udNfB|`4k2R(0ejjM&`|Jy zNHKy{ z7#afd>qm3*3y_Mz9lo%z@W6G~+Zc{=5hf-~POv6+LbToY5{TmC0p}|yUyhHD0ke;l zdktmWsMX2A7FwPMPHt!g(q{gDACWh7KZf-b$`74t1dpw}sx@Z7>PYY2hFD(lCPh$& z#zOwW{H?^8U2XY1es;9h8+vpo;51ZK4JCdtywsJEF@t9C;2aX*yuH2Qpk0`s=Kx`QRxyvD7M zAhWNmtOQJ4&4d(#@tIRG_*Z`Zz5O_h_|%>=Ht77;XHFJR-krmH+}w6vCnWb?YAS0Q zS&Rr1*_~7r>1(Um9>l~z@WRbAi1x_L%+Je1RoSmXD#&A*Tg01Fwc!M~xL-<^Ii@$R zv!F6F)x_#r8Byna;34;LcZbjJbaG%-wQ~; z5@C+KUzbVpyA%h0+z$t=@>&0*X`SWM<&u~n~)qK_jSWoTA6KLIg zIy#=HtH1bp0K9PJ-RO7kh^a+CWo6MTgzy`Hlt%~_3l!NB{qbtsJ3CRhB1s-EF?3|Y zuLqnIyL@L_Sge9G0$Xt41@gwBxDqkZnehWsj>aL-;sd8oB3RmbX!R@PaY%h6&6}&| zP^NDzESSD})y?St4JwJ5nHhk2=;okLTnJuA89&$#yu)3C7a&1HBJm*^7q5eC9DZK} z+#NCPHg%^h)<*KwQsEiKmKUQkh>6AA*hL}Mqot(<^ubWDV*9cX{8OU|Rb6-e4pI&e zr!}F|J*(V=1mJ;Zg9ytx`M9*JqVuNZ>AX5Wkb(U3_1Q}C!s@D3@`D-(e{yZUbTwJG zK(|j_YM=Z;e+NPGX^KR2o;qA*=e)!=92^C3PC!~Jl%%r|`v5E+R5Iw&{9SZ3D6$S9 zp|nj8u%_p_x_}&Em0Lr=AyhuLxk!L-;cNk>9=eG)2(Qy4bxBEMP?Z7ySp)Gfc0@?` z!+ZI{LPFJ0a6or8UTi1{UWDOCkm=*-w!1X_`}ZAo_SM>_DtY1!Fk7S6xzyake=i@xOsr?A8dpv{nLP`_2E zU5<;w3xlZ81>hHmmEq%{Eqjq3N4s(g3Q(j&O)WUgpsKkKPu5+MLnknQ~W_oNny4_me@bYGMX7g zcO?}+L8cCANag1J{_B&9r>7}vqtoltCCvi^x(-B1S?*g4wzz9+&!nVl6NN`O`T3Pq zReQ$A6&z^u+hQB1W^P=&W^Q8QUt6of5)&QW+0!GPLh)S3(Q)sVYR0pb>1m%r9$)qJ z%Dz~sltlJRGijeb1&X>>F^EA713Jp?O#kv$V8OJJxP4Mol%b`iD@0R4B{iE=QBe_; z+;Gzl=ldyLJQN>(_>u*1z6ZA`Nh`qO+O zs0z5rAhmP^&+S>EE{;XeOG#(IBe8-Wk%BzmhloL(*n1aS*IePM$&&HF`zx(OjRWyBLI28Gt2RKo6S-98_u= z8ekqkmA(i|NdfJ17Brc*ws$`qx}QPkQKVRLP+{rssbEG!J>%;2)b z#EBTzr++r4AhS;_Lo+f9sG4Bqu{Kg-;y!RuLiiWdnlvLIJNW?WuQ@|_3SgE#*_@Vy zzdVS5I&h$c|K>ZtQOQ}bAAwkC2E1-f-iY+{^y1HK%{AX-4!$VIYZl*E+d8gN9b$Y8 zotLac0m;S$?SPxhBis2GxL^F0TliLU3efRsXBSC#nTpXg=u@50hbJx6IhIy;d zNEUfMQasb`?l0ZIW*8EQf}Wu90;q&~NVp=M(}!06LWMqEkg6YT8K;TLZXo>;ac@3Sz zLP9-97JHX3D?7XKu3b5Lw|6J-R^CnP2Fma%*BPyw*(GJEebLS7>cV7l4mKPu?I@HQ zo5Vn&8jCi$R4`?h$I#;tS9!cw<_r{B@A)6!?l+RleDde_#J7?2DxR&K4{Q1#kRLa) 
zN8C&ox&+M@Q1OCO!aTh=DyphJq^Tx)dMKc(MzqSavTo;J(o>97orlanf<u@|35%IL_M2N}N~#pbk zjYeN~&`so*FFd>Y5bcWq!5uuPmmqh6jTMKJY_UWp=}cLD+-Drfi2!~4?nh{Ch&+Y? zcG_F&pABN7yn0n~WDGJHp;xUE)ga$s@|~F+_(5q?dCTv$Uq34wTdnIJ?wjj`#E`k^ zD}oDEAch3?C7c<8Ao?udRCj~s&8rdU$YTj@35oO;#??6}@@?-wa6eFymBo%gL$BYU z-BVK|rxY*wwF7NCNQL_Dsu)G;-F3;m2%R4nE32xyde%p$i{C+*3HKJ>tH1?uK4#`@ zsIMTsSSU}g)I<&fxEKV!x)71gsA3iH6UZ!_aAv!?NEY5)(Uz!w|qn5DsH*<9jL%U0P&k$yC|` zUY)ya7&?{&e^=nj&|+gh1QfM2UlZt>VIAD^J&C%MfSX;tgXk-WsrhdU^bMrZ)$cUc~n(fn^juc<#U>EN5u#;g^}bj72KP2Mv9Yl$Jx_&9gcV6CD6S+MY~re5b8;2}d=d^o z)%wq>KF7d#KOE8agxiNCl~rFh0|5`bE6T z=4T~@rzuZvq5WGg5&@UGat<^WUodSj|G~`{?`(Ay68Q*D&|3e<^LjrlTSmzK%Ga%W z$9BG$Ef1<=$QtyjSDO~N!DEV#U=ZNDgNu*vZ_EQ+@2y)#JxPMVAi}1bDzg}bgh<2B zfX>}Vr!T7s8Z_8$`rfp%uW_Ff>yWC*OM^u0;hLg1yOI`=elz!TCAYql3y*VXxU4f^Ty+Q=yEEuJ4@dEL>gf zv4}oQ)m(L56MI98ibVJ^K730iH1*@VSRZxZO^zhJuhx^e$*-5FrjRaAwY86+g$JaE zje{fLBd5bBaZ=Esr)8uhB(OnAVqkC~c^m-Ap+X7`J6l`tlhj+?2PNqUeiHJH4+}vu z-P7im#y;MAfy=TAKSulnB5(~wgYQ@J3kdE-@Gs2+@ktLdur`Cc4=+It_W0W@K`zcM z+?#kur=ax!jouvy9KpSyL_$Hq3&H9FM@Rb`%J7Gx4Mt!q0t2O?u70HiqY=;=Qveic zkhTVBAe8b72FB?4xWsc%i_leo)}apxxg&5^gB^$0BK+9t(+@PvB0nVMhQtuGz|NkK)vDu(KWV5z5^Qh znit=@y13Lcfyl+D3m|Li+`wu&vEKlYR^JeHzcx_J517*>rHC0cAA0`^xebPeznD6b zYI^#uS(oOEm-$7xxd(IY;57}ZugAh7SUKO2X`=S}fy~TwXb!#nlFw?_yxgo8l_W;{ zdNHCE5PoWpAIPt0osch2P(XnART|1Y;C08wsN43Ue5Md4`}r z%CHyK)`U(MpIm$nk!T3WZy-3D>YFe4OnS82HLx{WFN>Y(X&qG7zYdF}bcZ zUy%$nCaYCST`Jcfyucke1j1(l*bl&xF*7s!aXh_9o{C|9fnq3Wz!Q!VXgD7707?w< z)9-?SNGnDfI}|*x(NI<<(nx}F2`MQszr^+zPrSnBBG9-3e*j*p0$XxIE)ZY!svrfz zc@i=m8T6?^8ZHUz(c+8alJgg_)<=qHlN7$^)NewN6>MSfXTS!qhsz^Q#BqiE&ZUe% ziHGm&BMqfSDS& z_y-0qd2R0P!HcdQfgkdVv@hNo^_HT!CL|S6ai1;)WW97by!N4CVbcS57#SZaQc_UV z>qFoR+mQiSHfikO+XJXyj9sd_WB49uw}hxH`Nxmx2*xb{+JhAi_rOqLOlYc7Mt_bS z(vURtl21OB$Vt!u9nYu$#W*pclZx}ji#`d=qn#Zc*YGii@HH`TG%*HhZKox25^xD+ zF3P8wu!JV~v(XV8(q9Z15s4)n62E@^@?ly|LUEW{VwyS_=OXL=xU+mUy;`DqXNAMT zQZkj8Wesk3SXVhz#x&cq^72n#`EmHljYdaOX=M6}H)yNjlvfb0WHmfEQ;`SD8e~|& 
zhYk3<1%F_L^84OzpuK~-;CF&W)zoINAQp7XCKPeG@2x+fGu0@s#oHH-^PeS=jt&kF zk*w^PWOuo_eQLQ7+tBhtx$g!Y3RwA|FL)#)1FBBQRCfj=CjbqwK>$^wBch|GTZE`Y zwDFN}6`(c{y8?;uij9tE*BNeqOro0#lZIA$D#%3aP-pp!qN0?zICC>ItRQ8NM*R^c z(1evrNlOoVx*|nEmyN~{3JP?*YT&j7^r94cis7Oov+t46YQ;xJs_YnF>{=cvngU$g ztIn21kdBTHDvTb}e{L2S>cJq(P?m;BOk4#cJjo+Z<16+fMedh;d}2Q-+UMuG5=xyp zh@D{NOvvmdlR=rGv@|xDDvg8-6-Z(z^Y!dg*tobh08s>z6aXanJm6+!y>llskP4*Q z$*?JRACJBK16?j;YJ!~Z@~VcK8h{_()lZ$j8Ia`b*yrKq-+Y``wKwv{8&RZV_7!9+ zm|i31TmwSl1c@L&&3Ee5-&WGk3rrJao7u)LaV*!bMDEW5>nzL_xh>ZL_ zDC55VQx34n^S4501YBGoW~`{Kt@vtPZgLkP?&?*=ay|S*R1}~KRoAzyM>coxaz3|H zQ!PU_==p2xaT#ARz_;wF@14c{-O8VW~0-dX;4~R@&0ti-=y>CQCV@ZcS=e<{PJ6ppYPu* zFnw`x;;MJ4uDt|5cHI9lCA)B)Y~)+|wpRJ$eHL>sI$@beyPMl(7KB4@6TYl)1&M#?``$RPtDHg#17J3%h;6o2cU%C>I`4Ky0%Y>a^lM4( z%GyTV>UaH20Z0@z=YihB+jJa9`cfeWm!1kpltL=2XY~ z!sGoE#mCjRm`5tkF1(z|NPvziEHso;t2DE+GVZ!-HbDQW#A2-hpqYf015>q#>cP@E zp5_wFg8{wqHT*yTs6aF7>9Tl!L7LYFeZM#P<6zR$*K2F%!W@P=eQo3}tK znm0b^i7?S=#iTIT#ZbLsEb^TRFQfDVVYi_0My)+_$AlZ4!iE0+mG7R2`R1IkGPz{T zbQ7GtH2yL%!oNGjuCFy#YeMh1{TvCkqmb>dhX==yNSpt>W*G9~fVl+3advSLuoq={ zdC1`ZQdudGZwqoQC|E&l_$DZb`N4x5CrS5giI)L-fb=Md*w}8> zF6Mr0Cz$%aazE^3ufSuMAmMUtTU#XP0qW&KuKQjTaF|WuHIu}kLWZnI%ryAEnrsky zxFt=DVd0#L;P>fG33-E`-edN{aHGEN;{zPa&7ljLBU_QlD3F`K2w-tr@8&Pu}_Xh(n4QOIB>`NECw%+M9m>YH&!12AHfC*k5m32|i72 zo>FU7)u`Ou6{xqN8_3wR#mzI$co19EaW?zFdqnNxhHe|u3 z%5y;bY)`+?%j>dLk$QAWG{8%;vdhci`J(iF@M@kv?+pr)k$>#;wLJNOX;<--n@ze$ z{Z-P}&J7qf&^VfuIP%@VbBliM79G%IaQM~Q@CIxmY~M0rxI@3cp=x^1PVi!L`xYg~ z4C9+O!Sb;yZFdoOxy=X-jdwECrYgm01RjdGK8ktP`L2-D@P*X1>u2oL=2byh(`$+l z%;!O9jx6CWd4bpZJ&JPSExFA)$mG=3pK6WW6c_)9K@XQVul){3NJy-IfYjOKp>-%? 
zaCUZTc~$4!>u=u-=m8Xlv5}E1931Kpr#v_a;GZU74PEC!O%THEKGD#S4nQejH(Og< z#p@tf0@f429q@~#W#h#|F$Wf#Tp&H!+3}VW*dB<8ynOi*B3-wk+=l|_U_$r5i!^}= zwV9QEm3;2*H6*^YnE(ZZzVTd>nBxyuS5{R}AsF{C>DZ1MrSnYhbg@wJdDBlDJ zPoDIV#a?XmmApZ7NK}=_!jGJ+=+%4p&|7# zG$}abGk^9cCX(xq?_Qf?76(AGPjej!eLVoosT1EqSn_|NK&szV3K1tjcN5TPpgE3?*UHQpU7W0ep*o-QG`h>b^7-sEM%<7o(kH@d1^!k9 z(bqpPaGQq*3zvlW7e9Z>eXlc9%`%hrHxJx~Ngr{~aj~&|>PH+LHV+q0!ic-oi3+0V z68s67{a+QIb&#(juv{@@Ds>b}xUY>3bL3x!Hc~v&2`kjun30i~LFV3Pdx*r_hx!-^z6)$M?^H$dL%W8of8|kl9~vZ z8?_Nj=p?dmGI8pk)mA6bM8)iSybr(&d753DJ@f;6i4y6l2r62So)EGmd|CNzcx80p zQauYIN;y2yDL`Q$JtA^)Y+b?gYvB_%vWQ{5MQ2 zE&Cu{0q$Z7>Q(36rTM|^q5yvyN;0w!QOni_22(})MIOh$1N<#}aCSk)C1{@nIVQj% zxcwgO&x~{?UZJ#Eo%Q)hhYHIcRrq^7h$mbm-)O-mxV^huo1G#;e;rXrj-m5x-|eWI zIs!40&+91?r5u5MB~J|T-6eKnG;VWmzKi?rFA-a%>9y_>b3W>ru;2(LlNTXPk1pXa zPVn~(BnyWOfkg{Z=PtLjN~T};+O#K5b-!^h>y2^xwNv6L^^3$Rri8j>7b1hc;{ z_6jbCkcF&{=>+FAhw3gB1pWO-OvmcV}m0%L=59@8!~BBrT1 z3Xmn-v){qhjUxC9RYEZZ^2IjTYTTL?UgsNn`)F=AIx0#$dl2?p{rAMx|G|SE8AEW8 zk$OjmlJULx!ECG5Wr7qDEoxKGgGfu0GA3^BS6fDn-`t)xxLj&-YsNLUTFu%(HKRux zqP{Wc>G_~gBUd?-?ztaau-|FFpHQ5YwF8^U1|_1Pw;LbLER0We3(E7*6)clVC}IJd z_Vxu!{8uw&zgR9V{Og&z=Z#$F-2dPz!xx%s#5M0UH2H7tFJ`K%3-$)>?2IH1{)pM! 
zna$+o7342rm1AmaTO1f@YJHnjRxM9X7#}GPi}5GfGuEj5zIwP^Qp%s2)|NWTZ2!q_*gzqA@1(>lIr4o zr!y9Yh^!(3=!DiwUvS29uE7Aeh~VI%eyXppvveLmK1vo&5=H^!H=NHaC^)5*G|I>^ z8Qww_DB0Fv6rO2f_I$FgP@f2lJj!x7FeH;5wRINV?AA2oto)_p*PF{T9)lPDX*^&7 z(=|6^?0OFl2rSD*2N&qz2qo{Z2#BxT=jUIvuvJpiytgSZm#OznukPdM*EKhsNSEH5 zMr-36(SQE<=dIS|B93D0_jP20cNUyXnOaN4b3sw(*_f0YmgkzI&NjWSG&=5?Vt@SS z+&YedLy|hQqN?g+O^ub`%p}XDix*=*DLVtUphVnIHxj*YdLozxuk-v3qUgERSL4>< zDKeY?+xwq(p<8mYTCS#& zCL%;`ckjU+uA=V7#=-JD0bP2hwbJ1}6+d3*H6d6x9OB|Ai@_ElL5|Uy5%3^RHu_bQ z+64qk-_ft_9YvgcQsgrscW_nv5w=V8^VMcEz@q=D#X?DNr|Hb#8mAe9U9`b zas00L?Kbo#FTY{vmeC2R@b}Lwb1<{tk-L5SMV&4KfG=?0fDXF+RU(C7eW&TUs~Ifa zKlAc1W5w)@dZxRV>#jo%GAc^XV!3ETZ#2u$kPA5OjTxiH?(S3JnpmYpImOeN_4S~c zNpbUSTO(f#Rg6~JcvBMp3xe&8WIt`6M@0`hyiTw&ALfP~Q606u#rfu@{IVKxm68%- zgcV_U55``svo@Z%sHNyz_aDe5{!kJ+THYyL^}Pfn^P$cL0u-+h52VF{x`9Ew;p+>s z5fVk>4NPJY2lI1?{yE;KRb_u9xP9)+b?=@@Q?S%O*J0Q|L{kmNHQ^74gUIM>qDeb% zoG+VO*tOtXzJFh!(RBp>CkNvr*~hPa<42I$1j0$B&MG8@+G4aASoVRG8L~=eBL%;^ z6pY61^{`BlYiJ~STROPvY$QiW=wLmYcse4C(!HaRmA85r6T{^o6dn`=^|FN__p3g- ztdt)KQzQJ}M-k8M>?*w6LibY3JSiF z#9pqhe-imw#l;JwnJi8R1W@E}f`fCmW?vD~*y3Rgp!+$`YoFoEaeE04hl+;FQJ7`{B0z9CQPMl+=+%D?jdN;n{x>(Xt`+iWE*V>ei zuv(qB^eo_r^sI31Cn**t z;atAv;Mhf+DP(-~1&hN?pQV1kXiTY!l4*YhtW++~b(+@AsNs2N&eMfFNiUPX?x6`w zN;=W05RtPxtv^j*K{;&ds1Lb(F9CET0RRx7 zjUa0UruaB*pkO|r9jLJYNBV}?O3%q@8y!W{gHWY{w^>O^3G{}3640PHgX|Wda5fee z$N*DT^M!}GO;D;x4<@CO;o?5BSl(wIn&#>{S)SI*96Nm4 zoak$_zn;+=@jN70V6r@KPB~WrPB_690hiL}hK9@aei4CzdA@!`e^!}Cau8sWwt=D^ z5=3&q?G3i~O?@}MKuY{u1bvU=Zx7~fQ$GCELbtM8|BHP#)a$3>fAXjC$c+nFa4^Kat zJI0A4aqtTRfmkggWI^qHv76nWt_<)G&d(Q0PaZ0^ATtZL@%=HI$>>{eo-_JWS4oX; zrOE+UD48s`fXQQ#aPjQ0+VT9$`qAdzE5;b10h*Kg8O7d?j>vZj5PxT4$X)dDWq2rm zws5Ui-P@qnEa#0&ukGtT=Os?ouF-Q%?VsQNPdf{In~z{|4hXLSG{-6e_}LEkLgEqpk6>+o*n0IxaxNEK_n`7;*U0Z18Ui;Y z(2P|uW)!rE|M$VkkTp@)!knFS8lmLadg7v7wwxS;d)F?_`E(WL_GuJt~zjRW4 zar+MG=bQ9zR5d>6YqgV7eqan9Sm9#EBYXfP6-UAC%b0Cz6J}g=bPo&G=jT^{*&1|G z<`oxT_3xESjEfSjCowuPkkquqdzP*R)OP*K5|tf#2i 
zZc(-}b+4ABE=`H;^nQaCk2&!of$b(G%Tq}|#PQcOiE78$7`od-!)_~ey(w%cx-JpG zQy$8IiJ8pZ{bWy5KVwGm_(Txo6JzlGb8~Uy;}4sgZ%0KjgYjhNoOA>1HQ;zEUIl$; z!!2;ks#pspTfcfmVE+&aDIacoD~>?OZ7x#rn0lK+MzAcTW=~B`4G;5)it0i1@nb&` zFhTRGw=^|XR#nwofUbNQ&I7b~;L^XItMQB?I~$vWt*yuYy0M{QdsqB;7^Nch16Qbw zF7fbudC$&2H%I{>pww6az{hG!us^dQ~mZf1<$<>NHPnk zmX~`dxB{{ehh`Q zN7uEo^>#AQ+z@1N8#H$;(o?N>kxFu9rm-+QQ`=gAgLn@Zf>3=V0Vee4Vp_L zIfHFjj|M-08iC3yA~Y2KOf?Li0sI&M%*sjxh^;E`$3eJE^27f6X^04s9cncyNiBp1 z)M<6Ndtw6ZwE*+*3G%-<+w1#GsC4-c`4CF87k2eoZ*mR(*Q8kIjo)L5tHUlg1%6$S z0HvlS8CjV1WX#aeG1ir;HIxTPKp*h%+`o0psr}vB=g;D_Vy56(V&@y)fH;e6Wql$X zyQ^emD}yqY1B`BhReHz!MZ~0}xh~^LC#@A$BZgOFpITV(Ka!FX?iyTJdxDe{qn!W1 zlXiJT(VOpU(0^D&izKM`)E}we&I?TpGk8Yg&?BJh^|SbUo7a2La2!5=u1xd5mG$0s zStTO-sHRQPr8O~(rj|N_^{B?#4qDrvNEa39PtF-L1bPo3f(*nLxfMr1BZF%Zg7$$U zg?tQDN5_f)P8(m_96*&Y&Hhq#F&JG{%xxV(QUUQ;;9r&KiHnPa_fk_?iPprzoenC0 zrZ#~^@Q1*(#GZI{j=hwE-2;-l|Ny z-#7OaHLMpe;-A;hSQ4Ce3*zY3ICXYdE}otKX=!;XG?4QPo+y^S!kccEfZF~zlmyYI zj<1gPaMwk)SO8yiRxZnaS1L)olhs%>k+o&xTmkMdC>gzU$(^8u*80fS z$Rv97CCi7Dx}IQ6u~7ab^xX{c|CPCZJ5PsUuz&(eZuVzspy0_#QUmg*gxs^RV7mYtL?=#xJqG$5E6kgIf>Hn|Eu$@yS^TS`d;j20=tothmH$|4%tC?5 z>d1lPMzZ14?3M3ebQJSeVrI06j2axPJOAyrAAxYW0s?c7 zvlCWesp7=EZV|oB`QH%^+=0T+-(Mnus`s(kj=0ge(3>K2ll;b<6F+pFIy`iT7EY`+ zq1UFK-Zl4C_=m;*yZ(pwdDd%QcIbVpTOe(7bJ#2A%Tm=a>-o@pry0c%FVy)a~ zRA=(Bh1aMGKIgl4H4auHqHRZ>GCjC``?JjyEAPx?;l(YzZ{t6jU+ma~?zcyahEcvM z$aOtDp1E;#->8qy7&v!H8+kCPs5}{IgXqd%)s6Ji>wXIyziS z5cPmK1U8rvxV`J3SNroe-7nbuea&HMe0*+&x?|a?eaHP7v9m1%I#mN45U+86m!yQzBdVDUf9X2!h`u=s@f>b(~SS1-1Wd}p(Q=5P>i-^FoMv07m ztLf@VUN0$;tE~;1pD(w#|MBL7YH!tz(GvGLSzb-32#v3D`eWm6E63k+TGQTvk*G+o z6wkRf?c8ylid(;b|G7r3JX+Qsh)+achA0_?jt&rQU9EN)?nFeeN2kxXOvnFC^{=Q1 zz%Od?>uJy?YH1z5YK&T&2nLTYKGnU<0-dcIkikGfQSQjTLdVUms;5^AzbFRE%ZN_| z%uKSN@vWEqRPTSKO^zx4m%k)L!2Z+lhtXX_H~d5m*zdU8+4Ibvh|`4T*%&>I&es;F z3TR8>73-pKxVFmlpLc>Lw$UHq7&xy!lFXs2W3?RKgIE0ORp30caOgsceFWE>iK*$C zPt7wJR-brGgnKgt9-=n_JVKt+5_nHSs5@lO3U-VpwN$r?DA*P5a1_YW+stVH82Dqi 
zXXA2kQv`cxpi}})qKa#i{}s%ACAGmgBzw4JEm&D;zOZl<&HR=Xda!a*|9~s6UhP|0 zU%foB`uzO+walk|FE_WiFI}cpNc>9r z-yVk~lSBm>6@ee>+<9@vV>c%{Yu@+BFVCYFzHY7(|5o{$(A3t#8*#`c*Lw1ViDNQv z_{lo-+=I&^8PcD>?&(>09Domv$MfX%e;Z1Wg#uq7bPsrY8M5>Q`>6UZ_I3goee|><|R9An=$tn7p<9gMtt5szkKO8)!YM;RL(E=w&R|$!C z=`#vuivgf7-;mwvV!fLyh<-rV|309y90cB9ULIMKNLz%3NUkMF;ZSYQ2 z_3eWPT&0@7f$LdWwtyZXD>w9gTwIGuDL%e*d|W(*j9;8a|Q4*#wP5cXuJMEPkkDcm2Kxk%lWw+o;-d06M zX{5t^mvpB0Q{hkB>=m2gR3~ zWTSc_9|tR!S)tw<kkJWQSP=zey?ZyK^Yu>u4;rOOA|Ur*a_Ued z{i)N&SDCF(2aV0gukHpWaO8mL4*jtM`ScS(n<`(F@!QL?OS0ls0 zjOwd;@nuZ?{o3rcM?&DNxK4DH`G^uH)R|`hvT{ChJX1>&cr~thHI`s!fx~NiZf~L@ zwz4v}tn9-#k7LgU|0tOV0UYez53#g{R#w3EZtrgxV-h5gU3VSr<+?9)^h$Fyd9tQr zeah=kH}yHhH?+29!_3`xcdCp)S#a^P$yXa^#^CK0nZ7(wj>W`oKyEZHxlQPc*20@O zF28FL3EddsU#;n3O>OJL+0E};M!~^?!pQd087eudloj@4_&GGJ2(k!v0^p;-ZCSdl zaPuahUFh+^Wv{b7zfsT^c%hI~xx3IUb9TllEgg*`@leUSEO|0%zOw*i#VYE_Zf7Ta zv?#TjAo?hskn0nnbw}XLSm<6*ZYF zS1iak5~?~QuF&9wIOvNEnKlIf&u0ZrfgZ3x5N|?|_jonEtj<6wRN-s5=az zyx}&!$A0+|AvPYK>5F;CHMUyw{;%8HxQgBMiF`NQ{&Z#A=sO-%nD?uhnK6T?4MHi} zyO6;C&HG_-iFmqeKYk3cEJZSJd*ox+mE?~}X?|Y}C>_hU7^DFw=kMiZ42|CA=I0zJ z!c2s4D&pxseHWZi&follg1EFw{gtxyD{a%fynerM(uJcq6BGT}xVIKSI&XNis}9n( zWZ^IfmUmrJnQxDNhV)o1$`LT{h-n@ha{>Lwy?Y5+S@V~%Iw196y!l1d`ym>pFZiVo zJZab&7_xG6m)2{?QJzy16|9DadM2axg??DQ;}aX8Nv*M&vfRF5!$}$TIom8F|IX}s zEoDZ+zdvD;kfSBoj3coL35Q2UY*rWUHv!YfteDVba&PkIPX$HAiE6=f-v54T9Mx`@ zT8~b7S=p??<;Rabez_*U-&54CrWSB9H1yQd6W>5R-rhD=RNP}$qPfjFCZ}C3eE9mp zx6DisViNf4s+KrsE6spe7>;5H3`(7j}L+}E=sRqr1cW}yDIioK(yQimX$BmVh=A-S=%Joe9<=D>wFU96c8 zxZBdTE%2Mz+c!plch&S>JdovSgUbL!fgscP?lqA0TDr!AV;4+qk-rufuTmDywm?IA z(ZqWNjGJ`5MDm~yGhi))bpw24jP&uk4TN1XNZYz%!WjB}$fC9^y0mtOiz}!AqP?eQ zXRTI~$2l#ew>W!Q%_uZpqdq^1{DY&*4HjK#XPUk(XP;F(U2&%kJ!@kg=s$Kxa7nIS zRafQa=1GK!$Xy^v{fheJghNXUmJnNGW3&I>T!pa^7N?MKABPka+f;lnH6i^MQu=Z> zz|{dFVu;F!j*Z3oQv=xm5G}>8eSMTG|1~&!Axb1J=G{AOSni-b@BX$6U=e+*)8h%zg%IoW=NN4ze5OQ3+2%!Rt;9%uR(*%pc<6Q9#!)?HW 
zA|aZr9GsIFyS=Hgm^ja$PeL;cHd*M}Bzhn)5mehJcTk{7M8lAHY&Yuw*9xcZ=-!&n^hQjh6ycvg=hZQwMEO{4_{B<0y{EacF}~fz zsjSTs!>|5$slT&3@!VQ}#$(l3T!|G=bnm9Gi1w2ko84d%i~J)49L-}5bq%WM7RBOG ze?~Dg{Fo1gQKUA25fR&{tE)#EqLq@+`M9|u5Z}`Q0ac2-d(kO1AfOpQbZQn{Mi-cV z{%Z(4gO@IT8eE3m1!-%xAb|z?X&iDC%dJMx*+f0SPIKrXuZg&T%w~Fe8VtCA8xn*y zUkVGPFzZgZ(n2@}BX;lx2ys}Rii=Of4~JPwk2tmfEqepz8(_ddGF3lh*n_siuZ`d4 z;7E>){BCe}{AlN;GWy*O{D!>A`FRhBM8BweOS{6f(p5uEox7^ivX1NfbG4XmqY^2p z+d`w7Mi-_|kJCT_H311>n2a(n?DQT`Pst0IOQifusG4F*NgeRX(Wvirrxrg`8$Pkk z*2w!+GMkQYob7-$h;|0Qvn6-7ZO8NPH7-_ zHml!KQR4@0VT)wX=X{Eg@GynD{4Z@|BZ(`@c^P)>b6Hqet_eAO3~4{;PKy^@e7JYb#cq|{^yRiE-+I*`|bT4Ed5itPp1l{oaa8-4SR%AXEaR2^mbV2eAmQg9-i~L|%Y<05(1|BV$*x@CZ~EJ+C44 zu?`jnWDYe1G%%B7A)t939U|$)X`|NL@*(eyM*YDvAr{ ziO^6V7iMNE=zIC+(`ODF zoYQk<4`s~Amr?GuwIgrBM+)@o%qRG)p8S}QX-$nckDV#NnMr=$!PG2lehJ-D$O`{2 zjqBrykO~l2!hrN(?U`3#x0;!Zxd#z@5XA&+(-P=1>KR!aAioBX9tcGR6H_oGahQ4P zt-@Uj%CrDcpZxsaFdX!I2}~dd8$u_z>6!r>I%^b0pB-Q-1HG@WD8`+FTX0{1p~m2s ziv0PR0OrQ|U$t05GE*?WjY1d9z)DtApT4xE3e(I-oU`8zbjd|aA0AX%nsdSsDP9HTYwCkKY&;y35#Z)0LQL3eTS;+gY?sHo@fS0-gb0KR>{poJ5nB>zEy zpJ(DxhTX&%mCa9cnFx~>uqt3ugi~kNTE*T7<-hVzuZ2uSjL7qZ?t{Dt9Axnd3lJy- zau@U(3l! 
zIL9)j{^RLVU?5>>={L|e`phVzKLdEdxB%iEcr%k69i8#%)1T=^tTs4JNw~SW`S~lr z7fUk7GY57Oc*T0f1{W11(8GBkY!H2uC|!u#0WqEU_+nT5WsWhpEMNro?92=ubqwYq z>sFux$fylSOCgZ>?RDN<>1H^PU|Q+-3UKBwt*sH*r)Ok9O2)qqPe5RQ!pYRpA?=6R z4*ffX6|4LouHHJTtF3$ERp}N%ltxjyr5iz%1}W*3MnF75(I{vp5S8QokFI-qZb)Tzsv;c z_k67rs380ih18G!{_&xq8oFLVk2}A*mD33*W|K)Ynf~mGEw7d!*Ps_ zjX{+M)uD`()Q9H${CurfuiXD0VN6MOFd%9r&PiT659s0_v+<3>Y5I=twgnltRUDu; z0QaBh;k!fhUoAo-XwmPr+Pk8f|I4IrcxzNDAU2l$O;Up2_f`lC3Z*E7T?{OGo=59q z=H`w-`GuKXQPCPIHlz=jbWT_ONE?UzLvg<@sA0cpJeeCBVy52^cR9ltUd!L!IZYOZ zPs#&AV+ryQiYOU%=lIA-h$pjDs;B?26}5_jXhk9VQSg4u;Z8yQFa;IuIVL)rNk7^1 zjG@*xy|}pbk&Q2Qc5s3QW@p>~JszRpSJ6;YlUA#oDA1_Ab;}#MB_Q0(<9O3&aj_J( zK^#iqM+^+Vr>kxWH7pp%!2j`Wz%&VN~4)L`TTKXG6gXt){*^*HV!9pOjM* zjc|7r;cC{?qrAf-uimnU5xDr&l#Y#XwpzREtH ze@p#{F~8>R$}hkud^%+O`g|cgl9+YYXSD3`(Hu)bT@H^ek{CRV~|A&;v{^A>>#TmB6;o&y50yfw(9tJET z%5+RUvvqPIp~RENv(#esWsSk7IRBFp-+&F8^WX8+*I#W{OHBO3&hCnj?@yC9I;6)0 z-^0+U3xcyMzQeHUn7cv-G&ekNeEZ^;el@mhG1C1pHz5wF(p6}=aVh+NTuF1++L<1I z$V0>ZRlD6S3fNSjS7!bg@%qf2AZmtHn|E%!Q1t5g`JLV3fsZwJnbw;fS8~GFX(sBCMt)A_=I`@=1wU6{y#C#jbNQ;9YN_W*L%_CnH7llnfVx| z+Xn}lO%TH|?jb~4l+p`RNtU8iO0p$CORJYJfmkkNULJP%iuiw}eJdZjoKkC7qt6ksixz}b$0=>+7& z0{#hf34Y3TCLDS6(yK zK-Q9xmj|z2=8#$6^cCE@a`WK=#q;LZfAf9^r0JL9hK7a)2I+wAg6boc_?#TEl4H^^ z7W<--l1T0z2es(0ujd{f41d$Rey;8u2Qz56@sySwa&h?!dyEcObaWa{E?{B6I1Pq42&s9++H*-s2t2q&M&WAIH|-hm2dqLp z5H*qNb@jm&SZ{NLpLD^*4I=@#c(yddL8;}*bKUgsgL@2;<##Zte{H;+nVlUMu_vaY z(lua{^JNiHR-K=1PYlMeWzC%dceW)5)*S6IQPKsmQajvd-pEz!w2SI)yH&Q zu}K&74Bc+uL&os&OOQ~HAccG+^0D9QDybPKh1N7*BKJ>qb-97AGt7^ynw+Y#vejv6 zE&~)fsJm^~vH#nL5I++5!ze)a>y3g1u2F+~N{I7OLsxeeMDt*ijO*>`fh_2|gv$~H zC`xFK$Tq$G{lAB^df<$K`mxz9FN-h&O%4C{?f2NDNf#IHU5c8T-6mz=8!vwW#6Iw> zY}R)o?7I{cDNwwNUEjFFwF0wDQO~cRph}N<+NK}o>)Q@`aIrFs14*88ISGkJfJq0) zVkJt-8JACJTcx;LT5^PcVxrc|m!$??!07U}-UT@yD3{ZXA=>9v@(<&uq-pHM zp~YHo$4GzpqS*vtPL>5{3P~T}R|BCmsOI0EuDn{_`gcahq_OW44UcjH_-(7ko>oWa zG;*lvO;)CRJnvAY*L9=0Iq}t19U-A)R~I%HH$@Y`^Y-?pGoG`09)5@1E=rqS%Wrzm 
z{XXhEVJi*~KD2i}b=#ZY$aqe>P%fJ>lp|ZBs`eX#2t{wpZi)eK`XzNnh2lzuC7Ma! z+gnj{4bypT*x3J>{1LNzN2CKj*CVsrI=5}&kmq!(-spgdev-Ot0hS$rp??EO+8$){ zR7hPrK#xyOPBs8`*#?Z%;d`U<04q9}D!Ob}y6-|(uA&Cp0F=Bxr>4A<pFZj5) zRJF7OXld7=?4xSauZP1K=I|m_7U2H}Yc5>1;8XyU?oV*ULJlhDT?c3A&M4YQN`42D zVaAs)kc6Ce2`RTOd-EUs{7`#SpP0%67Z182aFtL<;=Rd?Y=a=f0sOxlQ;_(e%?Xg; z;w8|mNCnRB=mBJbfCN2Gc`ghZ_LratNY6EB^P8!$?`&>vhFkEjbD(!iJ%axw+)G>~ z5%)j%dZ3GQPZM28o-TC?;q>yY%wJ`sgU(NRNO%=>#f>7GuF2ipjO-7I zkskL(V_YW7!Zs(07mZ&d$zS5-&clc_eJ5J%t6_ zo_{~vUH&yFUGAn=CX0@KZ4p>C53{=d{uHgsR}2g%KO1K(H%CjRD+{;h7p)zSHs`7+ z-I(GVWxZUqs&2zR`-k=Yq7Khz7uHwB2x2@9fNK}m)||obcH5|U9O^j{V6qrHf)Nqy zUN_MEveG^c1~0Cz0w<#zz!5OsXJTeHGBN@LLrV-G`Rx%@obC2@3FtPy@cBqWO#BQH z2jzxG!ubN0M}exkx-=^F!}zVbg2&4T+iJjTbbkf|k>37>f* z0Jgonye5$pN#ghj>8PP4;0Em)OkqId&p>Mf2C2jr2nI;<(!zorAhO{1WM^mN-e+qb z9JB?LiWN7h(^Giu(1<}fzzZFK3tdB zXm|C?I9J7m%$a45j5ABX9PtVZj!XbsK>HiSQ(vHY@w5rz44mPw3@Z`k97Jxfs%; zjXTGsr*K<^0CWR$7&t=zmlNCJ9tkQ9o;CO?Q3xHOt1>C-?fyXs z2ngUk%E`6j4#6vNc80WBez zAF_Tk)Be>H7w7x#P!?`7YRbxypFTxKN8frzixkWTlP|}W?n`LL2NG~S4^sj5`)lyt zGzuRVaWFr5vURkv1@5OPnvL5M?;LeTMV!pmmsoV#VM6euq?`YiWAJItFPkaSaF4Fu zRDRi?jxCS!`oCDu^n0-nitMXO0vaoeo|%kWwX!hL2hqHchu(2+h+*4cc(O~4eUHWi zR1PG8!wNG;k$_WRyqzC!ff`ocA`dQ@aI(R91HsAg2H`kEUjsg~{-@$%wzEg~=`;1} zU2X<~q5HNBBV&GfIj4n7A%M3doFILavfb@gU|}IM7t}BP`fL_em{UmP6@Y)dlj)+6 z%+Dn#2pk-7LzpCh%Nd#YumElXFzXn63x^o^o4%?J-Y;HEbi(W28JQ45_}iZ6@82 z>1Ppbenxb}4VeFt9BF5wSXfTqv#UkzXJP$1@-*wqX}_M9mL>r66uJ%gd`(|m_pDtD zRbP!6xlV+TNrvL|7nwG*>AuA;D|A-KSE_PBN@pdAbwly!y&BICHv7h6m}t5YqOv(M zUU(U1B6={g{nF_!dt;9NAaeX1E8(H)+Mk}p+68hrK;^I?POhcvz4{qw>&`^?+)`5Tu+H`Oz zq-CqZAY)JoArQ_4oR<)+JWvg{Fu?gZ2mMNDvZw*?QKscPD6b6qLB%OrFXnoWSrd83 zxA5#>17*9k}1SxzWKr5+W!d7OtE@{*gJ@; znZt{DGPl*u3_+>QPxH=}sfHua93W+R zu&WCl`L6Z^#MCM&K?!rLau=?8m8GRX!Mc@3Dd5YFX`oQ5rgB6)ZzmP;H@|yZMwDfDcdw$<-jNiYyAD+r*3I^gYsS1 zz(qPE0Z4=i0I^{&gFyv?xClKUN3RrDYf`lW`{$Q1_5{e`pdUch#f>1K4S-JX7TYn* z_^eQ@SUHiL$mxW{gk|soVVWmKVkNS7Jt=@!O;RWFUc`A$t19`?RMvxpmk)6`&8R+O 
zYGgf-@i-7U+%c2h8@Ur>w{So+kIInsZS?6Sv9|Q$wuXJM&OBti;^?z>wBHmhrg#Q z(oqVl+$Czf%5l+iRhdt%zTVG2-FftE<#GGN8LOnat#Su95$|@=5293>#H&DZL;UYFd2QES>IG?3tf{*yOwMEbMF-V~1g^u;)~bx5%yVvoLT zvcJCafJtC^7~;W{2RE=C9sl$vt8Q*J&BbpxFobvdUtg=!-@!QFUc{qsr=$IlR`KiJ z#r<5B?yU)%veLK01U6@@yCODiZC{+7P!p7>$sPDBXytJ+j$Np4qa!cYi7sBghL>+; zZB0+u4W)HU!{bMdTwgJRJEp2^QZq7^ysVm`rumwm4`EFg%BibWND3hQD9=MA#ei4$ z;xqvwR~?{{0N(^4!(v?R9m$zQj!fNER!t<+%tHgaA5U&c-}OB_$M^x+*CZo4Fce>aByC8c)jVZ z-$g>a(lcUrEn6;GvsQ1vT=QkW&~ULXQWHypiQw$ZS>pv3UGdfL552|x`y+omqNUyP zYgj%wz>|rw0-P6vPYXIW8Mk5kdv1bZvh%8W3wx)Ol%}oGf`!Y~y6doFmYUiiC|&cVAk_+7KjJbs&i5n6@|nF+9w=uWK-%d`V#FwejHF$P5wPQQ(?E1 zE|-X#D$*qO)EjG+{jkk9DlY(~{cjO`b%}BjOv%p9kBVC9 z`h?UO5q@KmwVj`GBLiK8zZP&*}!=9`W|vtU2S_W z_9~(LtMOMZvU$|kLUscqvdi`Ms1$h*UzVp8|V&j{WZixn<$ z{oQ5TC?DMl_phtlm6iJc{WadzyE;0?{&k%Vz&}Zwi-~Dq3jqm$5nebkBu2b>F8Vg3bCSZcpz8Bfa_DYF}Psqj*7K=*%xN_K?*r(mxk}wv0L#eeZb` zMJWfQr>}aWP|cF12eqyEL2%NiA5pQikNa#L$%71br)8aa#-%&zrhj+-?BLql)LQK_O0J9BEg8k7Wjh{p_0Br3h3zoUd~W);;dr9<;c(r< zz)XM^UI(+*s#0D#tpX+!Q?28!u6O)WPce35w!qwHdODlF7EmU9G?sdz= zzFAw|=5yyAnQ%5RT*S7vUMVObGd1lQ92^@+xeKi5wmnMwKNJ)rSeUZ~dX#w?8PXqQ zuxS7kzsYAG>4&$w_IvjF@*GdxZ}P>9fGXnCH_?k83#zKbXiYq3otbezf-4>DVcWhn zBv>d}ZxgF>{+du~kp8t}J@dG<=F1SuSARAqC+E|s`afhjYR{_6qc_&mf9&<2CThH~ z`#X?;e;hh_nkBq<7foB~^H@F@0fhC8T2${#<8BzSn@Q0YG4x0uJsJPH z;7)eXnW;RoD;TQKpmO$GD6xZNy6U`m_V2BNih9Ls40b6e+A8w~RBTDaMt$XyfK#NV zrS-&8t+mE}qN9EC>gvtTlvH1QRI%IXTRy&y>guM_(%-bS--ZW^@$n71x~19Uk2->p zu(2K3Ln=-BVhXjC%{ey*(>-!z_aRQhRIDX`eVx8_>prX+rsAU(AM1zIw3q@7KgGoC zS+uLE>3^)geMdF>JiJIv?f0mO8lh$Mzat|+?Rh@_ZfbW`08i-C>_~dN;GmU9lckFf zcNs1DjvWtPE;=6nN^oV#38STEd8w*J`~6E3Lt&Z{-Ol5md$*aiK6kjkJR$-VmvoZy zK;hcqB(~(fzcg={cS!Zoo`%ui)41Oz&vJ$WGDP<&(n~65IQ>t@o1)drjTKynA60#- zq*2t8NzF-xa0hr-PG^2A?R&q?(ji7@s;{+JI{i+WZ*V9(7pE)S^eukzvKpYuxBarh zU%3A^g-wCkm_SW6)FY|7-Ru=SA7PF~@r*%u)ui$pn|(7Hd*s0I!N zcj^m6bfZ<_x4l$(sn(HK#j{!8+|ziy{fH=8S6eM_EH7CmoTV+++Nf-ONLZfQI;7hD z1G*2l$m5;)XgBA@r)du!G5j?85wo`2{UqZSg9R_MM~Ww8F5{z1Oy6vGTl%gcEWG4v z>FU9pvA27Vb)2KN)Uy!xhKndjOpXf-G 
zeJECYeB|3tKE0z$uqg6mV|#{{&$5m7F~y|YhVJ`}KilLvMG|P;Qb7rR1^XM8rB>cs zzN++f{c|;>lqB|h?wiN{;nzXGoHi4@78~)-W7f}~m)$R4rvJ@}sZppbko5_x-a2H} z?c`ftTiv~C_r{5Xo$j5iMWkO043?9!fBlpvzMQ7Y=y51XfBxxk_Lmalzn4fAmFI(Y zx-mDG)b^+Urkqt}SAmA5+c8IZ3HyTWR9ES=$bJ4~*YlI5_I8i|9Tm#3og6yuXDiX; zVFlyl6a*bQszwkcCb+wMdxRH@sZoBlloL19t7jD8v=0ew{JV6|#`+;am0xHv`(XXm z`n%JJxZd3NB_(y|#7<6|=>(-$FBGh!GdE@>mtrP2Etyj1HFY$oAF_O16&jc;F`b+j zzRN{+C$N3(@VSJIdStXF>6udfL~3>IqW<@v9iEX^M0a+HVC5hRB9JEn7xki3`W2I; zhfr*fwE%US7t#ND@DWI#B_vApYj;rnv|E_Z-`bt)AI6c|Gwro<)ta1hs23|)J1vP_ z@$P85f-Ar|*S|2Ze+W1f{nHNCujk|9gkGzTo%rz=t?F!tR?so|eVXGS^$1-r>AyQx zVR*;t?(RF~BsYbd`$g?GO_NOD3ko|=b$dOl4frVxljX^4^+&c&@80`Di%)!I^{;HH zL<*G9ta|g;7en}3xbvJ=?C#YHw{@`XB99zzXwu8sil1C9n9luN9Kb_{cX4sgtp!wJ4!;NAiZj>_pd)-L&w<9WHfsDbHh8r%k@lP0+#}$7`EoQy;X;$Yy_O-P6zjDZbgVcup2~m7iA_X`SHA|hWj11l9XS_dp zs_sz~h5n)_23n4h4Q0T;OU7-HyN_mh`%u+06uWGs8CQSvI`Ukrx?v0vx19^SE4i=s zM=jI0EzBDChbiA$cV+FlapU@0N7C58TVl_N1OKErC17$;TJXx8 zCJA}5-jhG(J*QZLWnK7UNGURrbMTggrmxs^j3-jiz-g&{*&v*zu144lT!&&;2bXJ@ zb1~gPWtZE1Nv`YMI&0^6sEEa8Cg3ivV!vlxE2=2-P{~(lxV@Sdv!u0Bw3+r(K%*sz z-xwA_UMtRfT+&u-CW9pVAT}k@^`nRf3&{eE%o-elif^!uXlQpKs>ugNe8*dOj~=xv zq=aOQm|IKME2-E+F(7(@8R4?z-t*Q_1rr9zqjsBJ#vG~PtUu+llo;~)Y_%SBXBrV6 zw3&XhrY2B+Sns;Q5cT8pla9TN8iDVj>o;jig`M1o5e zKj*@P8Y*o%%qXl9Sn|w8`;DBA2kE=$|JJ0q{bHyV_~#EAJxp^|Kw>&eL!;9ipOKLh z7x$`2EB0vl$Km>VL|Pi1we=YFa}F3qsuf{#^YFB?sab3efu*6^_Uw!|lw81Jw!Xra zxYn69{AtUtnH|Z{1o3pm&QSZXuoU`jD_WQbVB!v?i=T2?5`EK6etUiOh9>ISx(8{R zYS3xu;%1Pm;O}6@T4`lk7K}D8dRj{i^x^ z(Eq%Sn{N#khM)$RnaQDjB>SzXh*-dBA(V8Qh~-zYQD@-#x;|Weu&_MTClpaoYEZB) zuyKw_T)WJ7=VC=Gfu_>_-4BQIHmH>mKnV^Wf`LT-!a#pK-a} zGCVttlI{YAOp;x-9z|^pl{V8;!`6RD5!bp!(>>Mw?{#%!V>{17Y+n^qxdZ9pd3Xw# z84@K7kG6PVz*$HA95*05oGNOCi_7h!pDSn+n_TP^)SbIZjM(Yvb3nWWc@$u_&MDIK z0*|{8T%LORJUq82Q4K1%d{93+ZG5SrXCP-`@(G_KU2rk z_p>Ja^Jf=O2a+ehX>5FwBI4z-{VHki%YdU3?#7fVXh-x7j@}9$k%V_gMnw0O=* zIi;^(0`={tNWgx=u!ivS{UUT(nvt%q7tpz+CAS0>6vX?~LEO;i&nhw^yR2;swq~3p z@fXyzoF1Q6{SwOlo*d`pEhO=l__d@7yPaq%m0T7ZjAUlETUpngpR@LKOOKE3WPP>R 
zURvS=cqo7%q@rR#2|z=?F&}hS%jVZ44J&I@?CHyA&xEP;HN;wa2x!L*&ul0KZ`%+P z?^jrwvcMnOcLpf)p+}v8DV#W_k?0jnBlqW(=w?z zx~R&N_181hR|Q+o#o&4N@$ReiT_VH85NSC^NcKSObKtHVTgZ1Y!u)K5Pk5Ud|{ zM?Y9tU=OFbn$UX@zSBCjhr%wiRx`r2AYCiP4YGsq%Lv>Y7~l(w4%ILNjt;>k4dL&97h^p^Vf zQU@R&6m`-CJnXfOZF3ih0inpm*x1J%V15%45{mLi{qXgN2Iz;-sVPl>UAVl`(0B*X zL71`KLFxh6JjB_}iz-SeD1^(>v?+m>RZCYFzT3j0B6&{KFmyh|%;Wi^%|En)6nF>k z=^>rd!XizGpPwIr3{QaYn1DHc2xkY&G7OeqlrIB;#FiKC@Q|W&=lO^6);5UrgSiF> zdFarOVP5-lV88|jwUBs3Lqo$!va^# ze?iL59+R1A+$Qk`CM;uZ{(m6z`R!W{Ysngn|MNa`{y5hEc*O9resl0+6c0tdV+5vM z$uh=SKAuVcVd)m#V+ybBFX*8rKUIy67Y>E^8MBOkMMfJx$1PLZrNZLjt>f6W5^$6G zEKhD@GyYtG*xZSqK!^LnK8XAB>4?O2vPW)iS2Z4%D1VY6Rb2~_i z2bsbYg=1 z>Ojma_EvNV<{rt3iCCnHL|?s2BdlRC1=ExFCqNN@D)!>q)aQYMyL&wZTqYzS%n|eJ zAYKyz+Odnv87#(i2mWQ?xTl7O_8o1EwA4cQA1o-E$B$9`v2bxyIL+^%GgtGE-iQYo z1scx#MTj7Q&l(^Z%AlED3l9!H&s$m$tB;B_3855`tj?Zj5+f>Vy|lyl?H64Yh;zqJ z%EBxTG}B;v?ko=S(~+Qmh@=qr0z1(rX+*-}6?VNjX3c|76XWdyur~CXZv1PQemKjc z+W-Ei)&=Wa{zcUI?;=dR8E*> zeu>+B@zGeWuNa|r%LHO|7Gt$*$gY$G19ZXc3&U-IT> zX-99b6)>Uo%OJ}Z2FalHLhFP6mOP-ey4cX!GI<3AAQ0Pmeeh;C@IydHw*a$Essa5W zkl4Uvt*X2nXyciHupAlpUV{vF3P}7A6k%n>(?lUoGXOrR&v>h@;cWq4FNkwU$AO6! 
zAeR*Jd|G%xhma=b14%y*SXh7{nwFPWW49zZiHn7Wbm=FkI0*cbhYwr(`@`0+uixd6 zGk`(|R%+4M*j$ZM?B~zbSu4nR;a5P(^m2Y|O-2(1lBhAeC8B?gWc2p`_nI#GV6DM( zj@h=O$d*+g3P{kFjsicjBwU2u4t2c_EKo!5@D3U#B+6ZFosK`bQ~BIQli1R9*7KKu z;ZD};D4NIQwrrybeD%A7ucw1%qi19jEI1x?mnQ!0ZcabGJI2Y5?>O)-3iTB--iET0 zhB%qL>`U1(x0KWd4PVkbyK`^?28g79FbHzId>%|pO!#;~R4zj2KP(mkdl9KR@eL7m zRaKpGvrnLYf=OYs+w+k7_=JSWx;Rg%JD{|c-j{yl4dw$kxl_PQ^D=UexujkrHD_hE8m1wG{I2^9Z+vDMRom%OU^?A9<;hsp7HM9{v9Fah2aCK47HuFYj5lF z929q{gocKmDrZCuojtDnOVZQHudXr?Bvq{XydTSU`!4F@%%H{xT5lJDIO_c?xijKL z9c5)Co?nmdV95H!kxes=jg3)JfTczx0}HnvWfKkM?*f-sPu`8h$cTuuJe}ZxZ%-iU z6U+uri^ZG~Q$s=E>q6==ysFr^xEIFu&cKWL6c`?kmWL`+Qc;022b+--2*lvr;CVp8()4x(bBJNj$?Q<1gLF9kGHo0_+hh>y zzknbmS9y6vz%x7zYKjf`XsrEm0x^sVeFjniQbQ2L!`U`8v~QZ~bSIGP&f>z#$}64A zt;WrT%*Y?MhEndr*CSME_d_MK6!$)I&$}Na1akgeYc?M4Pp-o!BJ%k^7j<}b{i#Rq)FQ_do+?Otf@iIN#Rvki$Zye7T(BW~|a z&d=*BE&`bXe}ifatbQP^g#>j&V1WZCucCsh{>jevHX@G4Ma=H;qemco0l6$FqFUlx zT*R8hoLfp3WiL9nu5a{ z-^rkrZ`Q)X0#vQ=rkTmd$kMnk7#MH+30YzXNw&)`VJQobPu!NqA)w(#Rk7?9A8o_K z(i%$lu`*j<_v5v2F|GiYs9Yk^x7^7_F#BoLK$dOK^2&5oXjZ+}YZd!LB!zE;6Z*G4 zB!8IR|f2eKN zy&aj2)C8_4yf0yNzQrcq-G>Z%x8xz!6_>HuP}uLA@v_wZZ}ehek*1LA3#C2=%lJSs(zj zap!936-#L;2Xa3g4iCIwiq_B|RC15Z4a|_Rg+P4+idB^8k5zNU25tKHCI$Y*UjwTP zBE7*H{!&E+nKB3}Xz$-oQC-ULlV|r(PfkyFA)ZNu6da$&V4Q^8&IfLBoIObpNKj@% z$;`veO_YE<cFOy{rK5 z5uXQtnKJDki2YVk8?UbU+1)K|bic5;c>Zi(X0OHBc0DgW-D#y;jzxWMbFA*~Usoib zerPjw|A4!tn}ERSy`m*Kh?>ePDOrPSwy>}~nS64CN1CrM=u=8sxi9u()79u_&kwri zR3_RiGupHH@?Mnx-??LBA+cknVx$=T(w@_)>iDOtZ1a*i^aryS)IBx7zR6i;MEub2 zIJ@8xrYu0d+COudyA>buZ7NyosQ>NMRG|U2RY=9VB>wt`2C9nEyy>`07_k?A{AAcBw$gGMTC6fg9h4dw8 zb%DC*g_IQ7+sC`RF_(SgoRec>xFO9K!nHvm?E;}z9UzH}j*V>#6CbJ@QcnK-`IwfLz-MydY7FGOBp917)znZQj`GG( zxc?X#DTB2?iq3>p+M>(*+f*k3y}b%a7;pFF`|rPM684LI)lNIvIcvR*9&!)?zqMKI zk6So6aavqFI9Ep7Kk88I{q)=ijVB;@D?hgjGPiOvlQ_A?=UPEk3v-}c#}r@zxMGv+3f^+u_(%R|FI>D3n5xx zX__Ybe6kt~(@`@t>SK9FZC#nvVzt2XxB#j7x1?+7^wU)p4q8{Mok6FN{-Uj;17X;l z5TgVNavYq{?>ZnM4+#pI0>3uIVFW-2*#ml#f%C(lT=olvedT>Q+=}OBXDOD6f87_p 
zr#Ju~AZYqpnp=7e)WM|L#r$goa5`9mHO0jdudpz;UExEYZ`Nk1+Th#;86n6Mo!}^f zeI(-ZXWvCdJ^;g}FFS^1ch~IyM`A7jvAnvsO>a9@N%akn+&5XqL0ln5-SHb?P>;lvLrFO~{i7chJ)Z ze~u0v8#kNE|+6V2OAsFwj>d%hM&lWGKwr_ZS$yhp>Z=oPh?zh&@K8@{| z7TFMO4P>AwmqMkTw=@s_ln*NaD1DG-Z$mB5^^)9U{8g#Ds<>r1S*% zgYc%&+o*>irS&FF6By!wdjS7idrKji^z%#@Pib~ z`C)uB$fmZ4zX+Xx5?%U0au`!c^b_=5KfM6s6RaJs?j0WwGN7L+T4 zCT6tN9uPcqAI$!osROLcfQ#b-Eq)xY-6D`mh(O7{ImQSq-bKjrJVo_-pHX!b9Aef{ zpjCz2K0PeqpqT|xq&k>{Bn3=g5Q31YWwGsj?NC7Y-?vH_*g(+#FLni zU~nJ60vvQ*LTrOo)w~0flLGYgZ+l`!w+cXW?+HZ%J>mSvIi zTgDGI6WFjKZZ!s;)Pw?|I~( z!aVa7@pWEcK=0&%YyAMr{?dU<)0un3$0z0@q^j~+5qZWOlmw&pUXE^>L{i*6p5Ha= z#M+!MYJ_G(fz0$4E}iP^(wA4#3rtZfhAP~BQz2RAh&2HHbk@LEtCiZfBs8>3Kao&RLX#S9_$ zjdwv;!&Ck6!e>BSChaElI$dv8?LdF??p@fVPoU?82wl$rYTawF4kaWcK>rvU2j_86 z+faHzN(#mP%&p%zC%7@fCcSa60n;%scn=fsL5B@~5!~U2cq{IM6w+O_et+=cOf8N9 z#|8$*NJWLJ#IyZ~hm2}gd=BgN*(NzKchYI74eV1J8DvZ3H0O1{KaeB^X{I6~*IiQ; zbv$nb>%F|ni;BJ$6cG9;xqATbrcu~aP`*0oRYRG5Ze`_L2ZsRBW&%lj{mdLwem*(A zcN~rCtOUG8b#AswLv@F16Q5pu<`(EYA0V_5rp(W6=9xc_|ItozudLu}>*m%z*9skB zNZ-}dmTu{go(>uI`*8v2Da0?k`a*?#ztVjkz`L0x@hp)w80H&ZWFXNB;G6Z%7!rK* zL?$-USIGdSl@x!HNZ4 z@SpsczQrlU{&=D|1=nkp%x2Jt?N#ws_U6{k4#M+wdkVe+Ma475TQ;Z^o;(P(^+RUx z=Rh(C7FIAgH*Ga<>)*W#2(>t{7ro^@v#jDBR=6*RdQF9lBA}jn3VMWE6le^$JuYCLD z{knGqbBc=1MYV2(T{Sb6MKsMjvHl7R*%3d|HmEjL z1c^vc6FD(awiB4Jn^7x!3BJo@L`u0haNYTS7Y6^4bnO=oytkEoSNLswGD=Qd&MEGW zkyT?DDbjbo5yqAllt_O$``L5Po=t0crP-RqWhZ=fAk4z^^tTR-85Dng?>CUG_I#_W zOVMQwW(t>MDD+AF4bV_g0UPpu(3=?mO>ktwJszUYWn?Vd{BS~h!Y`pq1D#Pt=IzMX zSg3QU*iT3SR4V+)yTs_sI+~r=%V_&Z5*Qjw8yhvS6uwUm5zxrqxJF}abl8Y2D?0>A ziD+D)4qV2l#z0g)*WkuOXr=AR1Xe>TsIxsCEyaWn0<^hh#Uv~CzApQ_c2Y@(&c?y`zZXL{3e_C2LmdYxH+vU-o zL1S52aCWxi?yR4y>m~yOIT<-~oGg(a`&;`|aMu8Jyb`3wfB!f+gcsR=+=kU|JN6f#Vr| zDX^d2g}tC&((i!m`N*iKp59*Hhrk~3bLAz&rE|8kV-F~aj2wml-j-$XimxN_2;I*H z62-B2vK4pT#&ElShpmL0+c^xfA%+2tAOd3-Fcv`b51!)IKI#tIPrnSGW+0>@k-0(D z2LR&_I)j3PjxM^;6oD2zpbIfn?t_9szvUJk%HPYg{UzujL*D~OQ-&rg9evdPU|Grx zDfb-O4={j%h=Rd0Eu>rf=gKdY9)C9;HvfFBJ62V=Bxm!=B9Zl18m4OeBvo5gcCG8~ 
zprV4;wH?T~0hy%Dlq;upvC74SI(wfcbhx-`$N3*tkw{hiK z&&yLquH#&m(YcM$)#mn2*w~smp6n0JhbD=HA8#Dmo&S6yt3k|Tzvoo!T%ABjc%c)W z{y^BB{P(No@|l^EJY~BqufK$I0Ka_awp5pPlT~+~UVl~bM~+W1qMk#@Z$Ks3m-f<0 zyIRP4Ny)}?X_S;tO=q;#|CJ+-HhWv2la+|F++0?Akq2tRrUPDJ#pc&>B}v2Q-wOjz zR@oa_CQ^xTG^_KP(!rv9hv8#VW0%6a?S~W!X|rOQ>Lo&5H6!(*J6Y}HXSKZdw+=pL z^gX-JSxrhklB*@JliaxCVxl7ICRM`CF?BIOWJATuY6GK&(3Szw_c_3!@w#$fT!8bY z+5Ew$7JVoMfm-DNw)r^Q8rFzc=H?q<*fBRVYo1BZ&u0uMf^d?>-^g$)K*$d3C3h~(#pR(kA zElG&soviP9Xydf)S1>=j?WJgg>1Q?{kXHf?AmyHrvohuC3))bpCSZ;XatM zAXH?2toWus$dHn~y`LJo!;KP90Z-C+%O&LHi?Z|#kQcA1PS!ip9MP7irG7doV^K^_zi6Pw zFIy70O+b({f>K&KEI8|YRW6;O^wvK3(c_M+%*i<&DN?k?GzP|e-3A789iiEQ{LRgp zvrEii$(Og&y_Dl2S~3qNOiR=HJ;C?9WS#BXPIjpKQ6w+AZH#=lE<{6eWRn6OKVFzN z`mVn<;dSR;VI@&nH>DK2OPrFu52J#8>mRH#8Zn zP1msE_^9awB&?SRW|es}T@3X8Nad>EWka*iC<#6%Hj@msM;$SdDOPP*I~O>?rX0@s z@^=ZFjwtygyS~+6994>Y>+Bc%BaW<42WKN|afCQHpXT>uRzzPONUh{_8s4+sw>ILa z{1zBkyXdR}=XuUU!&#<9o_PQy$x-WFtu7W;ZS7z3jK$q3Q#3KHwQyICPvjAi-~66% zWF8xB5EJvtiM!F-)IV7;&}Pu=jLMdPz`ZUTRedX8eSfE2qIcpniXTDlYz_+E-x*Y7 z;{O}f5FdYb-?7^NQ&iiVx$gmU-syMP-Zl4P_jEO{&zxty71;#yC?d~a8}&cDEpZ9IjuI7jU{AAjgmYGtIT6MNqJi>fRplH)z5 zRM15inmn??CNY^Sv+VtS6tAUVT$r%k2p-(YQ1nLyUq2gE4pV)gbS zb0F*QkKd3rl-fQS1fOhtA^UCkI&*ga`YUGRp^wjVmv?+B>JwMpIil_7%lpW3v6w#J z74rx&8_%NB8dEx{C2v*TjFbIaV_((Ure0N)zS@Ahs_UiRc2D0aSt6V_*HbxuEWc!4 z-`z*{JB3hdXDAXL#k_rC;Jm5+3_kv3vcMTI$cKz71^@%Bn@3hJc_Am#kU50Ack>Ooy&R%@^sd z%NhrotRg?R8J(V2VaG^}*R?J_$1mxYv)l%X1f0y@(^JfRO_c7$9X2$Y*#4nQaO;$I z%CpC`?v&ix(hwitreG*9C#*Yx`v0Npt)r^!wzyFR1nJIANOyOOq>|Fof^>I-fV6-h zA>AO|UD6HG-QC@F7w>z%d(QpFxc=uD4%s~WS#!-fe-&?V8_#rH$?tIS_=~flsNr{2 z91e9m(VVh%GP-@?1?SW)YFDSGOb468;KlqF?PncNWR8^#RUH+6hn>k#-r={Wbpk0# z6z(ax+Hkb2I&(qh;r~8z4s2^ErU=w26uz+4QSGUv#lnWJnoBRgy<%MwUaNU^q=H-j z;TOfGNuUz(B|n&w6;+;ZtaZO~;49MPmPHV+B$b;ZL1hx%E3G6F3*|I_Z;J5$?85hJ%(okvTtYBTtzB9HvTI@Uz=~%SI0s*~=kY z8h#u4p_961(e3thHPcrEWhy%`yg2GUFEJu(n(Q{8XxG)Fk+~^r|L?{fPJRA@GT&7Mn9-E&cV}cIyrBne zaHXMD{EZ53tAB5f2Z+c+efBu*Qr}n)P}u5#ts{m0-2dgXTp=lYne2FbM36cqGcyqe 
zn*`ogQf;jQFk?MFi3hcwBd%XXRh7q9-!c*2Vv#O0!8^hQg8#Y3Y6XIV0`}V^n7B+G zMuTnKGP1T1W~Q!+CRe{)ndA^jStm~)kPR+!g zj-qsO3uKNzse30AHLp5!dKms(ALBVBh-9Kc#Hva$AMNV|ic94o;qP)+UIjHNK<5 jU1fDu(cQg&ESK47m%{nXNKQrNV7ZkGTu*os?k5??(`BI-7h3%M z_Q=Q+^lDaTH}k+a2tegKd(KXd$d*ewR08vJGj+n-=!6>^{Zk7K?vi3Is03g9;HQMf zM@P}jJT%iRgRVkxucz(We71%=#-4; z;FzKRG?>A|3@6Yqfe8|t1r+8j+ z@c~XZ+UlTiIU&riCvEC)66xzrw5-V(4es^MxB8)`jnnxMSWC7|eNUEgfn<8LpH8`m*i8tgQTSe06mWD#_-=#0d(K#z;|O zV0QLnv9@hajvqGGEQvs^j@JBJaq(u!2&)FC z%HyU6&d>LZjI`&feiHIezS{l5vQ0`#-qscrGa$CQ_A`!YfkW?YNgVSM4sM*dM7Wb3 z!Pn0*jGpD*$lyXmCA{W&nJCdN?sk0~u@e$PqBfz^Oo)is&JC_6_pIRG#qjVhah&L% zEO!(eQ5$JNSogX0N=Ot*LpQ|r24ifj{e!eH^XnK0f!Dl{H}T#=!~Zb1vX#p?u*3?!mxr|&`%R!`~Z{O2u6`j5Uq(W>2(_x{ z272?<*8og$u#|oMXb)hliKkeu$)G+a@lsU=ti;V`O)IsDr#-rXhelc(rWeP zob_Q4;-SE|vYT3j6aKuX10pBe)wMNhP6;@XN)E8Nl#!FO+EIO0e5N%wr$}U0InbjO zrTlazb}d@e)|}GlgrxoGUD98GvGLzHd3Efu@{WIY*5!NA`?0^~y4cosz`0I@6L`o+2eV0nHo5AJ{% zJ#}~4!9xXL+}5j`?D5$Bm#d?}R?0PwX}Zas1hdAfQ!)ygh=28D7!P@`Gf5zLX3krY zaNr&!O0uIdXor-*p-8=7aVp9@yDG+}7rIOpjkMo-Od0ZA>*`kN`>K>H6ksX(TpeFx zSQBP+^IN7fr(PwC4z@tbZTPrTmmFkqJ(l@^LF?Jgg^RsLH{!YI5B&;4Bg$!HyCpz( zZ$XIY#$*z@Dsq0(1&VuM9%={{NRyM3y2HS43?B`x)O3t0vr8wG>iXse0H=(f%CoYE z5i+oCZV!Ksm|uPkQhc0isF?sW5O!6P(<)h$<9M4kL$H5$Gu@(%X`0g>*SFR*phB1! 
zRYb?c^czBS`j5n(PU?s3sn?M_pE{UVBEre??6 z&(BvB6ubgb>Dk#!oS_#_Yo^ev+NuE;`@tLQ%X6~1W^iz8=HFkcs!q>TcyDdl99K0k z>pt>#cEMhKLD=|d7Kqe(dDP9#!CBTFAv_Zk=~IFrCqsKQhdWn|j2e=X8sX}uKUKUE ze0u8gKu5>uNd|<OVRD68( z|9nkH% z1Y6$ef;d3g2ux4EoGjW?R;~x!q}ywx?-5JphK9d>@o)>b{8%HnS1Jw9* z^d~Bo1E!<2moxbd}Rszq1;7sgID}JR_ByYyPMy8K=JCnO~<%Oq@d}+FhowM zXl)fUZfr6Y=Y*@VV&mPoFZvsn*LOg%6s-!wcab18o>N|3oGt&v3?q07-JPE5(FW<@%KNaP^6 z)-u=Wzw_2qCnoXpog1+Qd0A}}N{)KGNk(KF9O?N51w(tkAQ4VOgg#7_2;10|XN`0a z3wr_Usps^!x{4})f1i|;gpiQ7uP>W^$<*?&ot~x;3VYSn*H?q|_kaNK(Ke&w)Xonr zWlS#buChU7mbnSdC(T5}1A?@#?j*c+kF9?5Oxoi_gbaWzKKBWLEVxHqJ!ZdbO1W8` z^3q=DzALQ7TLSSa5Z2W*MFIlvl9nc8)XmMyUY#GV?Cgaa*OOr(ZDeFZ%qek!k{^J4 z3h@9Qu|1Zt{crY5Fyyg<)`Ek6@f{_VwyUzsNh^D{6NZz7hDy~1{*|5fz_}QHE_xP_>-#NECNC$ z07nG?T^pnt0qu#2$?w2GxUxOat^yi1U_kN!dvSn!1205?tr%bKe8>evs8<_IjEob2 zOA25$j~ljm5J>090s#*pykdW{Vq*8#5?X=Awkx$n6fpk{DQmb(VL#cbiX=i+~zt5zyDmaVg3^I9= z!)y={d$Zpi4T_W-!!CvDRJ5S2VIY*3_|#QKcGT2sXJz7I$4`MMD|_xQ0Zbq2?s@%n zVtl#12OC?MIVSqsm0f!>QcQ^R*S#Ija&CIWL>}~|-}!A*r0m>Jr|GTJN=jwa^2eXI zY_x2^n^v|N@QDe$g_5{`v2pU7)#7 zDcksRoVK`j8=paZ6uSW!6oAV2c)M2uXbpg4{_JB8Oc_m0O{=S`OU=O@3GCbf70Joo zes4Sv6OU*9l}??V|L7MDZQyJPOgmAC1%9QZ$dN^!UtF}hgLNk{HueOFXYBzL+I3Fz zNwij|^v9Li)g+T=h*#iq{iqKzwCzu&GUHZf>si-0N?b z?0Uc*V}4*D*D?629*&Zk?qo@pqtleUD*JyuA*L3Uk%b<#Lgc)LSCVa%59r$OePX<| z8&_aY76R#hrl}|eY6ON8v(R`P+~wzg&wdHVyl|#f*3=L{hBeJTcqfY7Z8kd#1Osk7 zzbHxSc-$_E=U7QAs*}+$(=Ah3&4#+x5=Sc#Y6e+Pbc-`7QUyAjqLiqaD04OituvZF zT`SD>wxXJ?Gk1*X3R0Y$jsO?a0G`Rug4Kx;z>`#hNi7PvB>}P!Klr5~ zcT=3|uCcx!zL5xvv5fc!COO$XTLxXNtj?#pLU-l?<&Ed1Vmq|u;#RDQ*n64u|3`^~X=CDvx%u607r@dR)z>R1hdz<0S9@P0tL5ZEO9 z!6F0y9qcm1^9x`u3jixLbac}JAR}}Kwwx?mJ8j}xf9yVI0y?9OqaGmqrHuiE6ZR(w zi{96)taZSH2-XIi6o7sH5N2UvVPr%Wo1L5E2M+~gFgy@UI`{+v_f68-n79|L195QRcROABz<&TdQqMEv-jmj`c&w}j@`Ju`zpi`3Vo-x(6?wSd+_u8U<| zYQ^WTzo1gpl+|JVs4bcK<50le{^0%RibL6RJGBYBO1{3ZO19Sy`<)ry&2LD;B&Md+ zj%HOVnji1t`g2Bp3kZDed9qy$Dn*oKcKX6rXUYIOec`P#ou96lH=Nq?LB8dmedHT7 z;TToAMd4p1pBsJw(n4R8 
z5LwD<@~cUFinC$V*h0|Oa$`N%{Az04)(G{;c>dG-ZxZNnINayjMg=A4fdv$u4hh!U_Q&p%K6+LesX$qs zQOl=%;b17)Jvst?=p!g@rMAak{$2-na0noQK#8*IY7iJt1Tu&K3zfzVd~+bW2bi-h z;VC`K6@Xka2OeDAC>%zuzOMk=2>dVr8L_jld|iD$W19fs5=jJ?ZZPr1?yi~11gPz7 z?ChjX*}B{vM?JWmkmjYVnL=nKNhg(9>m69GVO`Ttm_JdZLLT=Va=Ub_UYVYxy~Obw ztnYarO<<}TJea-1^)eSpk@5=W$VlQzHF|@P*Th@%3-glCW zn?0^1bBBnH2dFyD+@w|!Dr#XUQ5HfHA*s9B?E2wV&6^ret+g(p!Ri6Ub@7^&f)--B z(6T(Lj@k>AgWcuCxcUA4U*^3xt@2vz3Nholt;v~3QyCWph==FrwZz4<%Md{g0*-qY zKp>m@Jm8F+7(kHXYfQ+{)@p7A%(jV`y=(>Gry=@K=xCS@8|n@IFK| zoxKPpFom|+#fi;+4Mq0)H&&MK94pfQGBXcrB(_P`=}STzmQ-fyeKf8SkL937Q9T0h85$QPi;gvOV}bcv=EABuG`cT5g3l zgk{!i1PX%|=c5$B=mfW&51uhSd&8U8e~sQux7?qGCMM41x=({xC%`!Rx@Tr;N*a>4 z2{b|*S4q!V$jbM?v=(sPpfzD(U|z63C!wgvBqV@;Hv{w|rG(qIfW<^5pF&7XjDqCa z5rht8UO>_dw3dL(PQYOT`fg6sQHq2d@F7sSz`Kk-AXb6{3=sDci%kmniaUvAKCfII~ov$DGU!m?`xRm=PTD1d2!Z?3dHfO zTwLipz>t;!6Vvm}1SO?7C8fTg;DWEO2Qu=?Tn#T<{J`|I!spqvt;iO)3kil;4s#;} z9qDYc`0qHu9og9!z-JdX4s0_ez$(~syhmYWu_Kuu^=RI;YwK)ubWvQ4qN6)7FaSjo zZDd$zdIAJBfH%lT8BU?`6J30FVn6Bh)+JiBzR7pl!LScs_79*?eEhuSm)3dYbQ>7w zyLKJFtFwtW;=#0_Y|Z=^pP-~k@-cQ0IKY4+q@W?las%K+Xk%Ee$4`DDh(5xA5Tq4Pfy8qUS||Yl zfoS$QA3I>$52hp*K0XieA8|{rnEwFS<(7ivRO1X1i4M$$lI}c7SuXB>Fq3 zs-!-X1;GyH2E~fk?Ofpa=0{A-Qk{K4JiFxV>}?p)95b7pZ;SV9n;q>2{i4D~Sy_lr zKX?Tt$tB%}6LAsT5>M7}7n>h`y?JeE8K&K^2?ryVoz=KvBjxBg@#l{Ya3}1tMcOxR z^_7C5RhfKM&y%*@9v}`FIro=yAEMSh8eMhvd{&VGk?hAVh}=L#J39WWaJ&D3<%L+Y z=UWA3R;-(DE|`?Kcwapf0D&|}znSmv^Zf>Bf-ee~Q`N03ERgO!0D%kS3&BAxRa?%1VdEzSWN`mX@@0w}@VJi3n0qo$KfC%}&ds9cnTD)c6PnVAm zxmdWmfC+pWgw8sa3pbQl<6OWJ^Q7z4Ixy5=4zav$qFg1RJDw;ohmDdgI~r&+Qr3^HP4%R7$GBqV&XEA&pKg4K6m4}V2?Gz5Pq|h~Pj985yw95dObuIW z>aPx4_3Hx3UNNzqogD)M14vhjfo~%H@$|Y%qA`ZFFXnyQt&>Ui-TSE0zBAdAE}9QU zX0vmHMFvlgLzq+y_IBlAp`kC_1RS=%b#!Ec*+DvYfl6 zq>`|na{^~+Dv?V(0#frOA2=_BV+3N+UC8~w$_!t=b2G-(gq~GyvtfAWf zpghJn(u;obEQ3*h6=3*e<;+Ql%@pr)JE^C;bTC)_uwQ*IE{k;pWYo8JcRZA(0AY5xSF>qjI*KJ%JLe|&r6lL0U?`0(f1EJVZo&8gBXW@7cnMZ zt?Lx{R}+9zzX{0Mv(=XUD>nJf+^^RqKO%Ox@6Bk&zY=dJv~b|~rZTT0t~B^+EQfKX 
zV(ZJhwl6lM(DuNo26!$XwfU0{4S{R#3>bfFDJTR*Mph+j@A)HkOG*^GoRA%C4q8&n z>!2bf2^tU88V!CwTyRIp@;qI?j*hM|9x7Cl5W2cbb$s&bt9Kl;v`C;;gx3d>W@Jdc zq}cUv67S{Dxn)NXQDZ#6Ai8-Z)qK1w=tI|hA@R4{+P?fWbbw$N&m`|*ip>oJ@3vfl zOm&`Cfz*av#kV+?)mL5O#;J;6_gMFy3JSbJk9XsECtgX+39jUtVI3h^0~q|;n50K= zhmCBl6F*SP23dK@bMasu26#d~hPbTZb=|VJ_t+v44x_fjUXAtfuh?CNLT|QfbjlxV z_)9YhIxH6YI7K)ON>h%??_t>R7DVXsx~+|LEx)|@cFz~3P&f&v=J$4&hf}-pDc-Jx zPMKm6PgV(p*@Jpso{;dt=J>6w?8<6qa3Z&)sTqOALOnpT_|2!7IN0rR5N-$v%rrE1 z&o*DhCvZuiv9X@~VQ!)DxkMq}=W=s+0!-2D+?LRA;)24C#xI!+qTz&1}i38s#AT^4MM=O&>l1`9UJz>1W-70k^ zMT2K#{`vRm>Q*6o`3IJ$B(hC{;NK17rwh;vDw5~1u6kZt2Cw~Jezd5a%kYJ>;jP@YXs zfxHg?umny)+ zv!1P#{<8izlKcS+5+uA>vgtifd`^!c-eI@@Q-h~3+8|X-0lEbN3o_2RC()OiklNtN z9M?sXWkm6Cax5-Ej7K8w;%q)wn;w1hWPZ>hmh2~6d&-op00qlX1l(&ZE<(OMb&EhH zViAiv>(o}9jV9xf#Nj_Ad0oEz3Qi`$`!Mt)MEvZ__q4kn846+xht{(>UR5F7!3NH{ zO^6FN>Pj(Jm!2b1Q8lG6 zXwtvY>zL}iYY)&Jw)t2h9(svMg@n(le;!Ack?zRE*>&}&!DfZq{q7^PZZ0E_*Iq}F zq`#p)Hj}Yl++=^?#r=%s9o?m*uI@0-%ehMyWm!9}-vI=fS4jy?XH$5>&NPeQl{)o& zdt!7`?P5Sf7uj1ct8G4^F3f7xHceYFEG*?(YjnCECRvlz%-PrU;Ol8@)#-17pu+d9 zgz#YU^N`K(3H_zD;6-;6_RrNQrlh*mQ$5iyt0hyusw8y0m1&ND6&SKKUZrP{g@QzLqsQS-KI(ro2`=_vyqh<+$_?hJ~Ofd=#{+%APF3 zoDL1BC1PW+p5|8B5=`^eh8sLXN#^LlC+~@X3UTHI%=4=9@>83dz1azk+osfGD3!+y z@nE7^%~?mYhXww;wI775EA4S}&PguU_Zx-nYD#S7iI80wo=SVGg>z%>15JnbAL>qY zI(pF6phe0X-gA!g*l$_eU){rYLO`i%hR`fBEa6y3{UOOVtWGZV`1mz9yk4)j^=wV> zpv!FpN466P~`jvl-u^N`HF~l zAZ-JV@#B^L8Rc|_130g*+3+cgAQc&pBx!gbJ|dClAe`CNXnCIY=Rp=UIS(d^( zJ;&dIS{VjqHO|Y>l;sJ>cd5mllw?Ta;%qb5Re_gUwVFD2)ixxl#j1`{jgm~RGt)W~ zS^i14xZZ3i;Y1<6nd6y7H;2ch!qqAYjif(FUSS>CY1KDmHF`%m7~A{Q{&q8$fRDO( z|3M;7UTo`>hvHOChXeoJ`;Mqo&u{tgOcCynJrhAX_WcNF+&W$u7%iA9X9|Qw`ylOl{rP?dHQDH4g>h?T&!)OUZm}BeMy6 zm63&rPfb_aa$|atitl1Q1Z_LswbQjT8^#AZ%ptz)bLTIemkXAA;Vzor)uR`)5{t8e z(av~jgy%PO(M}9}^0?3}tmmT9g7o|dk{41Mn_A8ecYTwrL%0(cLXaD2)nHzVu%5Rg zZoCtZcH|{fqxf@MIv8k&*YGM+D?_V@<`tKmXEasJ=%y-_MGi7=+lIZ|L6gJXyeDB2 
zcBiZ9;^KH`w+>w~<^#(0SqWha{zk*OrxRB@Zc6jn`r_VzpxR-}fXmX%=(WPPmgUEi{-?-{pDGq#7V(kwyWKns5#=d-Fk1mZ9%>X#qXIw1})Fo9^;p{y86 zA2&ijh}#j6*7At1WQDAi_`-^~ut_$KxGncqlj{l!3FJoTZ_|78$$O@!#ESE=)__ZvCi@Oj}I#{E-iR zyrnDTWnXhSwYAs1#DfxBf& z9>?dC_Qa;p*q^7+IFmSM_eEZ7G&PR+ja>(On){yI;1-hhrm$g}=MhRv$9|t{BEOZ_ zpV1#x0XernE1R;nuv!>y4R#mP>CUWLX=0;*GO6UqByC<{IpO zbHod%L*C)?Iyo-aN6+{n&y?D~VOzsATVp@d`fW+S5;AVE%@^5q3U+Wjo=5&uzJo@{ zj@C@XPdjrL!chNxJ|9DB?!OS?y&%ITL^CWt@V4+@p+I+n?aT+46 zVrHrRPSj-FUSE>?$UCMxQGa_ev^v@E`Hk2DZ-kuxDx-qZ42yer_WIt)9F?|^mx26! zWZ0Dc6@#$H#x+TLS!Qgw?tx&JfkSjj#pE9e9N5;-Kq4*oRgck$q8bbxL60U1m%+-5 zRzvX&*Gvr6q@)IG!_zuZHD6p*+}{NW)okG?L1U)s!`6K-49)^VsGB~0^tA7G{=>-FhpBcO-{O1-|T>H8@7*(n-ZX(rV zL1K*0kALKj@;W@DldSjlr#I5}XgP4M?i>$>yb>lPjet}=hzXM{5zW=8G(T65teh8} z4N2Ca+#wk}!En>x_|A7YEH=}67S9mu5LEAl;?`}A&UIb3PBT;5>roR~6$*S*m#BQ2 zBpdF;&)qM#`rmZZve(GK+&*w^EW}Em*h_S3%QRQt&bL`2UKutIeX=p(3rN$CkCAL8LW%4pGO zqCYdu^i!l{ygbw2wnE}iwBB_4H||!P1gf^v(dim1NC$wuP(VMu!f6wU3ZtvROLAZIm8^hB6V$_Kz=y6HT$=ib7JJ51)t@|*EKOs+fpcsPea%j24} ztL1&>HlaZN?;B3W30&g8#9clKYX1z~dcSRO*u}Ve;6?*3H*l zc?_ouGd3c+f=!1V7FPHh0o~#R>VJPf!%6`)q%Jd>&Yr3pir{o%n=KOG0!qf{GW)?6 zx3>zN+OJDfKC{=B&U}i7cqdH0`uO;T-0Pu=L;)d^yszxL{zl*c6CqT~{dyGwT#cE< zo5+uSlr>6=6e)F-v5I=3Y_!<(6+dN_{P-|*Wq*@$Y~#+%R6PIq|2&V*#5chMmHV~u zdljRL1!f5QP5c~emOS3$eG|D`@s%c(6ebr9Q#15(LkHGEAsHLDVU_la8*l@7(?x1< zA0#>hpX~Mqv}G-;aFP?@WQa;Vdmq0?7vd+r6G4T!%^Y)Qq7t*ZL~y_-I-p0FhR5)} z;9(r+q%$*D*Oa9Hpb=>0Os>#KkNn@q@l>MBmd_j5aTeOITubrSAd<`wF+y6AV{ey( za?92R_7{72%zMK>O?4D6*FW(-HKvKlpwUE7&~QmH^u6jdv{*FdI2!G?dZbbtJSs= z3txSnImxZ&{%&}!ie`ds3p{*TJNQxwxiL2Q?4sh0&zno`(V+$M+}NDhC=u;{Il|Y{ zylMOBb12vxi5v;LO+t`$-|5w6xc|)LQB_pv+lZn_bnrWfL952iv_%Of10FowU;Ecm zggrRh+?lYBtfBusB6V(&65pm&ocamtB}>x7L}cN`?FMGYo8!P>>-1kPge&!g#}TvmY(w{d>FZ@YiZ zYG4oj;UsCqxqZdfBq6<^`e!{-h(ShF`PC6Z+W-AqyeD*zkDs&i*k7b=zGNhD4%R|0 zhqLaDHq$zR#{`8Gq7I)DY!F_4eb!>PbvstiCI8|!&)XJNe*nwi>< zE<|M?-bi)8OpT>mNaKU4bG$U` zR*=!gBmJ{%76Z~CpAjYWf9GOCQ6M2nrYhN;%(uO#Il7N3uc`9~OQ(a}VyK~et{NJq 
zo%{BQpRKQR&!Q*#1Fi`7E!ZtiiltF?wRPC6R=e7n7dH+<@Cqrh=G{&v(>>27ea?~# zWeg-Q6+@i;eAwX}Oo;?Lu-m8c4?IxnPS!7yR!4=17(&HIKcBc0Y9=z_ANG#>-z_{h zr?V(v2BITh{Nn?U!92v=`Tku~ikaz0eDM(hAvhEE_BTkB7>jen zJ<|cYCA!)K43g_j`dr+T6A4^BW#;P|mZTBGYrccAVaK~xhPS8Mi@##v6wA+)T?vm+ zV(lfboD$Hi&5depeoK}~ktwgpEqHjW_*Cq5U0m3g=HT%Luc3~zQ9nf8U(0fZXB2(E zPAID1~!*7_7cjdKY?OHpQXp7&o~JyS!uWeyROBYPw~2`$kmlBa-P%+00+shpB;S zRaLFnRHdGkAK?!T9sY%)@Z*+2qQ|Q%hMi=3DaNQ;pEwQe?@L&z1|gAz9)=w`?03v1 zd<}2KV}}1$3A9NeV_UOcd?p~QjdI~pomfzT`ftHgrb@;OHed_r?PJ`uv9aXzQI@Dc zH-snr_$?&c)YZr-yO-ugp34{?)&^UV*3A#0j)B#YO3ziwXs`T0P82PboQr0lI{&@Z7h8NSrAIqjn& z>Aa}>a6TN@Pi>_hOQ`Zo6Kh3~dF86EH&HHS(cN^m{)CPIN^~q>L^N&(e-|lL=&iJF zXpV_bLf_~zXP*3MTxw8cP!>g;hs(kHUt9Z*aj|&uJ4)pmDFfvJAXT%3wnNRN=Ek6$w(|WwLu3m~llq`-2 zKRF}b$uG{Htl0@#%$yIDIZQl#~LNx&QT$^1Ov_rZ*{ z9qxW2Y&~I_6agmMT+jR)mcGC!#UR3uD!2pq#Bqs{E%#p>-izS(k}486qS*ba-<
mr!v5*oxE&__`| zl1y$IVzxJ)A0L({nkwg^qH{TSbq0-@vLsD}H#;Vr(ehS1TX!r{hg|EP>{hVuf^6FL zHtFD}_niIX{v-XR=l=IW2p{TU&RFTi<(LmM5V)Wut_J)x};Xzf2Xf5P||L6xua*I z9ew$Cm8%H-8=wQEnd3sV;>q$@A;0GyWaq}8B3G0D84RV)^>FhMLaC_9VA>(_P!RUtEWk#XpWeA~Md!SMMzvZT`p( zjmqJ@*xk2Uo|hLZHzzgB?A^|(hkRLyXKBS4kq%zVr{Pvy>9*u0sf~_T zr{t-@i-ah-Ip1*}89`eN|DU$_8Hy~{!Y3qTP+nnkqzIbv#>VJI5XTS}ZUG+seR00v zsElK!xx^`@yQ~3-T)Wh0c5+;DCqKdW@PTiSV246xwvxhBf*jG0Q>i9P(7l1ebn=f; z=K5<5UEP90F%FB_7Z7TCn8=dUrnTKN2-xtL2TuC6yC1#hiHb=eie(`%JjynRi+CzC z+Eq0sx%(6Em!;2T#Vdy)ZzZTJY%VDx_fmG`vgzQS{LZ#^{ZXY`?x5kd*?Ph0MpdQz zw^i-;|7{~MQh%_}azx)U5~sH~=ifMsqFJC!4qHX}k)vH07>cJfwY1+4Vx+ybF-ahumsqU&2& zz63W=DZgcqWFWQNL!=$q`aBc&#@uK2|9iHT?rdN11NcZFh5F z-1%Yx2D)COip}ax3GG&%*=x>kun*+??x61gRp|LV0+@?(L$F6!&D5mef(?I@XnPYf zQd9YR-#QzsX?)5`iQcc~!M?6Mq5wiP1efNb2J&0~PbaNRmCp;JDWn?Hvc3;f^71f6 zm8J}=MX`0t+J!`(y-s?`6t-rlNQ*aS^Zp$}HVf9PIoy~aTVHLct~fpT_pAD>iRt39 z+JC2oss3hbyBrmVP`8WlMt88LcFvLH7+;xL$49Cu>(cz%86#5G7L`+u`TB7XbS223 zEBWfu(&Lrn;0$)t#|_RIZA!t&y5?&phFhP>W2C$9C zchdDy`NP;)bxQ5yepPsa&Fx!B6>OdL&CGrY7f$ov?4Eb`{};@-_1Rh z?Y^OrRsTtud+zmYjmbQvT~$%uXZ>1vH#w2Szv@VxWOojTI};ZZ4O*a`cak93r9CBY{AW@VoAKc`O~vO5h4(?4J?&8!LX2VU%G zXwkt=GDeQPJZTuULy#LLD#|W!-I!F$#ouFQBsMn9Qo}8e{ugouqq(i4lXS1E$NJEu z$ES}ISE{ZGFr-pYjcGKy6^C+lA)z)McJRn*oksW7BIMavWq%z z8ipHWHQfz+YghWOVnebm6A3EWOj+9Mnm&_j?|no}*5KC0*jCY^!pSd4xR#8#+9q6_ zBEHXmT^j(~al|z_sYNg!L5H5bwO*52NyciCW?;_#o~V}opO$q2w5(f8uhc|^zP-Pr zr%53DLw!eJ$Zl{Cyex9F`)Z80tkZikA;Wn1V;Q3CbxAjsAmg>sE2-MEH~s4l^!axO zGxNvN;os@46BZ}}EisHxxy=pfyGMc$HPNvDyqSQ3Ae8!X^gtoxt7LS@Z%ydI!h(;D zwn&UL*v=`9`I`MyCz)>8AP=*b(QGzNXNuOm+jY8WkY3$u{nRgJ#ZGL|wQsj}MG%g6 z4u)yb@noa`F|=ED)@8l3;fkLMbp$7WUar^1WU28Y=tizc_J0N%OCrH(>~{5|P5Ypx zxWmB(d$+6gR@jRPHIjz<&Fcxx6U?s{BaV7VFwfol^P|YM+v#BO?Fc=SejlYUffbD% zN)xhrQilY&f&NT`-@69ox_QjAst7MT=7F6e)3>>?X~UaMX~;o2^%v)Zm}1hO^)wj9 z+$=1X(|937`2Q&VyVvn*VkgZb(j6q>~~vo`ObER-_}&v-rkm;EUQH<8%FomA#c^i03$Ls zvB_{bx=uJ_q5{&{(NkdpfSX=tJU{lntkoRl0-o(1ENllUVV7x_%+Bp&55BKid 
zh0~RzI;W$wh}_Dz>BtdL1LOoQ7h7h|JIiAEUo{3Taxm`+we0uzeduWdst(5otnE^( zKJxd4^qty=NTKYE*C<&`^I?e!dJuIp5~`|_>C=nOye98|H`U}8eaMvsk!5}8a)mN@ znt{vX15KLt!+&Zycy^lMJ%*KpqUaLgKk$4r)F-Y!g)25@gR65q?#w0g^@rtO-!s@w zKe!NTNouPMEOP&_m%=DCG=aT#Q}v2mJHMdsa?vE%HGx-AzR`L!%Huoo`mDKBnJg1j zxfLF&*xt8i@KDBQUgUaO-_~*BPNbrSNu#k1Xxxt)(!`?G6eks=i5QGqkl&hhndXz_ z6TI{DVu=9DEc1V1j-q_hpw!xxp&PjVZ~*>1op#OeV?N37)ZbjK54OM!BC6xp^)i>Y zJgq+t0diHf1y&uE$041flUcoR@5bWD%TiP5J?`-gT7q*MLX`N*$0^w}C+2zKhra=s8hiw6ILMeWLEe`l z`6&N$dxLiVN%!R{5i!AsRP~Ua^?@mrpw)mEzI3Sx23QgOqk_reo;W_^<#X66NPWyYOd^vJov6LaLMdK+ zULa_|gR8okaWTZWE)t!HdiJVpkA2o)_Up}Vf*Gqj)!8e$kIm9dRJtVC2XpR~iS@A> zOF86j5q4L}^V}OEyF9r&N>gFN_Q!T|ZdUUp6~?y=vfi;}_h0_sNhcNdk^$$vK|m)# zJU)!XO-~lqb_9>s>7_s1$fLAqY`DYK9+&5G25gu^H`8(8Dl#B~{I@+Ux7+9gF8tUi z@4b&kUWCiA+u_SX4{rW|5y)SH^ZA^Ok(ZjP3p_hFr2n}f5T22`w~egIRFPqagtVD7 zkZ#q%LRq5QxxC+}OUCPiM8r?`hXo>-jbOhEKYsm?5PnXuvWh!$bW(&%Iy7c%o$6tB zbyERX;&3q0I^Fiiu(T=M=kou5dPyzD_?^c<)N(mJsgzJpz!T8y9Vg7w=)BAToUarb zh5w}${IfWsS1m+>hbKNVm1G@oCPBjzZ~ftK^teD|rtHez*!X%H-W`*Wap~)4(be%7 zdx~XSj3_3Q^NWRMOU?iNmmSSvxgp@ zI2ya=v?C1wKVaoX>cT?1!cdu(^iHs{wb?~8A`=CWKr z;Qk|g#+=iJipmC$VvfK~0vZcHq02<##q=#zyc2 z8H#zTjN618*R-V-SwI|dcd}Y6Od_BlEXuf*8q5CS?LR>Za1~*~2+sIt)&re|`+7h( zVsJ$zK)wQxcu|0b;Jl`FbLOb8jb?49%P6o(l1W`)>L)eaYYB@KEA$gic~<$E4QtAz zG`OV$MK4fj1{To_cYu=z*|bJk5=!}rDLH}!Y7$KXjRdzi(af$6rHH-wZMUK|Q z_eGxXYctQBhi1@#Qi1iUr}lJV>8aTB{0V|G93hRH$)Pb^MTrtP^d*1a4D-JS*v`Su z<97c&q2x-$txsN?)GQ%Ic!d^WV(M@a|RnEeETiPHa*8{0uwi zyR20kcWrHEz59^-gzjcvyvjPB*1hGzs#$;Yd6|quxRgzYC%4zn8N&-}`s($Ufpf(z z`wff)O;)o?O4lqWb($z7drkwh!h^M{zMmp10iM>Y;`D%;Cv4@QbnB+kP{2;(fU-ne zGnE$moG`}gBkq6|hGHE^l^8~)r_k^hBZmsG8vM(fS+7(ZxB2sUOX?yPcMHc2bsnxF&e} z7(Gt=9~hr;aArxJ5HAquXmMq9GPZ=M0P7M6>zLH?w6N47%csukHr38t!Q?Q}C%5>^ z(e64S_w&m^16D)r58k`pP=7;~vXWf0ZFZB!nzK=XaZ>%zYGYI*e-Ami*Sp>`a<#;W z0}8UVk5hj=8DJ@AtvikAaCxM(Wb$I_^IPhkYTMzdw8tl`mFocF6&Ha$dc;Q;Rs&OA zm8f{wdvP`HgYN8)J#(+^N#~DOZXyD*K|v@MV@HBFWY1c*!3pa`OaN1fJi{ z7j2NR@XaiUmbL7FxbH7N18@?6N)!m-bnN)We)mcmN;j%_33Wx|mU(Nna)Q5KI;pNy 
zK5FK+as|S_;_@x?=h5I zoOvPAzcQg1Y~pp3^>_|ESKn*frjTcY%D^)C4~X;Em2Da1$C+E)mC!4;{Gg2bv0#o` z#`%SO(%GOmHJ_(9V{_wk29uROUtPJGYTSadB|f)Gf2*56H4PpU8}7txBn&@%Ld97s zYvz6eUVfvIornGIr}_i)GKpW@(Go)gkBddyYslSKg72&7>FAVHA<*yB+pdg^=`P)1 zg-&GY7{S9HqK$#d3ZbCa&0LTo=mwiLOM|t0FNLLwhRktp*ZrfF(|6}G6rnHnOQWBi z02qBw`R_d~yUF1T0+MfN`5btHc!c6{-?&(>WeoiB(69c zMc-tEy+I{N7|2O}-2B@xiY@P3x*@7E=~f97%a6A-8A4N}y}Nh%zm1QH!P`^>Q2mB_VrQ3#I@ij^%s z+*--H61;RPB1s^kf8knpXdn@TqeholHMUyin;IxjRwxKj@o(0=tFqwqwo+sUOQ-z- zrV8Xmr;|?-Ud>0Qh3$X_*Fw1h-P-{mf5;-PjQg}?zH{mmW=fSp9j}zez!xTWR?U|3 zq*m@xi+UiOSu5p>-zvKN+U*$;nn)WJ_G;71+YDFCRT zG}50%LrJa{Dr#2O`~d^Uv%|we?MI#+uV}Z_%(jsx4jan@<2gp_cly6_nc)z2`Sa6; zCdvmzPVH(p2#2rZ4;8%T))wXYMJe~UG7<{Oflf0R4@#^jC3(CP0OU@~m2~yU6VVIf z5T3Cv9ivdaUvHdpH@ESp)qoU;RtIk^Q@t?y?s6ls!h|m_S1Rfj=F-bU6P41 zgc=S;X8Gpw=m=G1y^Vqxog&5he{yjE*Vm(fy12Ftb|AOmwVsu+Y)Npyuh-u4a(&kQ z1HTY=?ec{{<4MFD-U=_jg4IU7Lz$+Il{syGLqX{`0#{#T$A{hHFc{WYfVD42;y=Tpd!td!sNZ^G#oy2YqerEKuJN`ZY~XbDqaLUd+LP*G!wZ#bu5?wO7-~E7afT^@`WG=n672d!VKtSc^6htx?t$sa0!$y4Y0E;zDqk-0xxE@tdu_^S?t4I=`*=i~~=#=KOb$y0XwR)94%K43<(cU7=b~{PHPNC_+WFh2izkyMMa|uObVP%j4A6& ziEZqeXlcg=3lB`T@qBLe7yEaAWuzxUxWApEtD^i(8{}I9vHiX$Vih0z?G5O&(XQ5* zjhCk*sCKx;Fp__rpPoN`dYe5u5n6U>xM#VoJDH->?^|Sspc})ibzKV-VIN=R&2W8| zAI_&9_(+;|5$Ha0Bd^bB_ImG{?x_*SAKF5m@~#4efy5>VAn7q_F(t(%G;lgDw~0z% zf;`-f%dMV|TxM&boGnDym`BsfXCKb6VJ@Wq5nsPv2ZW*sCM%8j_7HBke6yutrmiq> z?H(U$$158{GFwX_dhY?e_&frfZEu3M--HU&TyLyb8=YPsjew#%@3%BZ$67W^j@gEa zxD4GMpwtYoD7`(N+ndaG7)wb`<9iEMrnn4iwkr&7Oc>Xg6<-dOK+B*i72s{2!u%l?3OlK8WO#})J286 zT<2Bl!3W=;A|v$SggtC@F(_-3VHxoE)z(Q!wHT8(WIExmmQ4V50IXf7gi>;6EgUv- zh8~Iao*)esN8%SqbV6hgGG4KKU9?n=OCbO>*WsSTMlt$wxr{Q#ofMsXLcsoG?Iof$ zX$>g^_qRk*(vPahQCQGiPt5xkWAMheR-_STJ(CI^Rym@C;sq3U90Zp;hs)GeHKJIi zI^MD`|Gl#p8yhb}Q9K%IvfA0-nZt7Qp`4C&!_u3w6PhLQS~cE;Hthzz)bQakIcz7+ z*M8G~?t8IN7HoBskRTt3*5fef4++Qkkd_*emW?`Fav%~w3`U^0+ThEQ!YJ7rAqOh9 z#H6IG7Ax_Au5V-#+x?MNSXfKxTyO+jwGd(Y-#TnI)){R#XKPGs;IX!;P>a8S-=5A{ zonw+!(l?ntA4#7YHksQA~G}ybb{ofypp8?iSHE()*wm{ 
z+AnAK;0qcSbTpFhdkr@b^l~8tbBoiB5+kBaP;E4>GUF?)oGs)4YBO@kDU_I8%Yu!? zD%>r*EcgVvo)ORkNCC@vTM%C*hbVkfCp(k*?$_}gm~WI>m3<8F$3O7@o5KPsBH5fR z<$p*NF|{2%fDGG92udmdO<{3F>D2Zd3glAROeeGcvRQAeRLC~*`DWZLKkDlfsMW}T zGIv(aC!mE}%ghY(*_jv?_VF}}!{H<{Z`-rg@*^z%DW9ADA`m;PYr;3$5i?RAttK=3 zXg5^@y)Js<$mxAvUG>z}S7!<;CNkxO5s~~%n_3Wx^z@h+83pJjK3p}nY}Q*H?4YRV z=psWPA>r?zI9%H3KJ~qT{kIk zifVMc6!>1PafXt7nSUqOPDpBm41u6MA`IHm9+a4B zb--lxe61s$uMO}?(;yqMUZDFvn#7kY2C=`NZ)U;mw+A?mxLt0FJe=d%thr+%Zn8R` zg8O`IJw1IM8d^alArJ_#wtna0y8G!$EZ#M6%n=sc33S~{Kbbb#&GfNus5Z|wRoD3x z_5cu4f0@W!imJhakb%}GOFz{I?oA*a_F)P27A# zqsjvCtguzC&bTx?D;{$m9ucbH^Y#qp_4ZsKxAC|WO?!82rmls6 z#n)2s={(_I@#NB4er zTy(LH7yZ@J&z-?sPL|Kc$Oxz-emtCV=(D=9Vq+@@wO4(if?JkxtE57uvrRhwb?W5+p!DDPj+W+J}lV zq5&x%PtwVCM2&R_-jnSTE-) zu=1U1w$pwJO)%Pp0lP7ru3}B^%6{jK))J9zeKSt+`}+S!EPaDTx(1(Lcp1usysi zSZu7P3u7{J4Y=tNA%wYzcg~(}8epKq@Nkzk)z$gDU%!#WYHBqo&ju#M$CHuknJ>?F z&d)0+Tp!j_t@RKeX$NTM0l>}q1y3- z7tf0}rTnd^P1kqD<+(cnVEbjcAmjJ6C|G z;9HpalZOs`Cvg0HBBQsxI*{F3r=pu$y5zQZ?m5&&6#DAxg$5!|*!6oEL^fHD4dU$u z6(o}nQ=cC#o<>!qs{&MCmHem@D6;v_|3rPuFX*tIg$!dyz34dnZ)76%EkxD3Fqh!{ zqvL;q6+ooZ5TZXV^#?piF=AC*+}`fhqyh3;4deP2kwS34d0YhqKyoJ?z$Mr4e{tHMejqEbpwY_!oF0MRcL!gkcK3hUV(~}_U_F4iIg^JoB1e}pqpN(JT1NvTZ*WaV zmpB&fhFA0Mu+*jE^6T{CkofD;3>7e>7}R=nO1KwH3%JguS~C|32{Z9Q3UCEc%YMKpw!3FXF0C;lXRuU**x&DqSHzww z2B2x+|C|0o_oeP4(i7y-LTi>;WTmHPl&yWEW^vw4Dv!}|cM8$QL)~WbFX`5?Hw{Xh zs{GQ=g?%??o!(?}V8(2u6CzEM`melfFP1bg59UEmkHNj~=B3E}%W~)9|35ABBT>x! zLyO(AJfE%786j|rLh7{JtPBhTP+gYuxIi1_F@e7{yZKx}m&niVeYA@^B^1~5L*qj% zwtfiSP;lz!$rd7goO81QAmhL8cFrp@;hbrYhm4}aORajA&#AnwmpN`X{h0k@6BD!) 
z6l{!)(sDip6?z{_H4FMxr7}3-*SFsv&Uq8X|IlhCT|ZGZH8p+L>+1u2#EIlvWm3D2 zkGt`>^H;#h3qyoc=q*s;615VRT;@ zhbUQZb>$}ix?bJ(bi++wU=4Y`xYj>2xlk_F5`t#%Hz}~VjCI1m*x+&?cN;nSlbBkb z)(Al0>%~eA(c~HJUgurY#5Lk{Lt9Ayn_*>RiH;8{^s(GF^%&em-ht+R5xN2@K3vPguruOMp+4Tp*(6lt>KY%{XUq(1&{wrtR<^lEc4ukJ384PkF4#FiOQ_dHA7?4dqKq1_QHU>~Tc-?A3&WK%_mt9{S4h1@-WK+}FMQwr6}%UOgxX{kBJP zKlf=OE8v!Z90L~&K1ly?AsSysDKnF--b=fFR84XSi}CR>!N>Ww(@anT_AN!w4>Z;X z{6v98E-G8^C3kmINBCpsEQ~N>TjzE!tkW)T@W_Xy!Y@~hxHdxc!|k>)%%rGY8;K~< ztIDk~4dUHrZ+93?4B<)a26bU`A!R^5%tgWN<`2uXC}3R-K)9@bpAMjr4`R2Y-iC&` z`DSv;y4Yazz-o~%MM9*v zfyK%Bc{s~rAgb(e&Z2u~pePdi=-M?cD+}cPy(TP7GB-z5CX-LmP73$8WFEM{_oU~Q z2KtCd58amfI;)k0%s&JrfE?ic)4xbc%%HFl5`)tM7AvdiAk@Yw0u^{xX81Y3K1k`I{Pl;?76F(W)YzhFiMrVp-*2t2#6-!j zmc~OKNP!TR9%H<{Z^QVXn$+b}q#VLNUIA1ozQ5+i==67`+b|kv%(6pp!fTl#~ z>zQU`Q!Ul`Qbl>s^X$6UA1bs52zh1`DMx9V-DocCNPKwX4jYb!-o`S8^|#OR*gikw zl$89!ypd4pG!vj*a1aig#0JLV6O#RNrp$M9CU<~Obs#ITT1pQ?>ccH%ksevTHWz+F z(4YF{j~hqWzEt7Hdc62j1=*wzYAg(7sfSQD_JPe9n7hwG+vCK8KI}(#3nXN574|?~ zc5%r2NL!c3&sb)YjRd1mqmNhfPp#pdU3Cj+EQ))cR1H~f8b-H)rTSH;Fk@hT#eJl> z*wEjIlRMJ2Pk*!@8rLU1D&BOq%u4%Pl=OUzwsyVjem7T$xg9BhvRp08o|F%KylsO> zNz@?Yj#CgN9>rMSjnGaE_gCZreXDib#BNPia%R^Xed@-Lwzoy5Wo?>YZoM{uSSJjk z;y@5m0B%Yypscz=A;z_4qtwIw{coEwQ_UkV*XgzD2hsdZ@c1Gk_;}KgpI{n(9JYRq zw$xNqVbJHkC=cs6c3(p%sKTn1@7E`jZ>6-hp0Z8vC|if$_|{F_n^yD8;#MmXUp8PZ zkAODl!SGg4Fe?e^URzf?iL$aa^8C+ z9hud}GL}DnytcR``+t#fzdt<$x_bj2)RB7!_CL(b>b82?jEqR-(n5T*g6NBT-h0Vq zQXa02oE4`*aHA&8g51Ea-eD0h(A(+PTQeW7Dm#{N?IemP1QEbccvY-OzocXN++PjR zCjK$((~y?k?IW$0WU_8X(IH=X35=9!zw@kLn@)dN(BZSC-qPMa9gB1FFh9PV?zgv2 zX{u?hG@lD~Q%3_tv-??Dh zhz&g}O%Dd0?KGe{>F=jQnK#Mal7n@a<6CU;TvRVfXQ8i=J~2dV?bnVhyRISKy+M#? zkV`dXKDPGizT&`{bm*B|0KbJ@EM*um<==jspAjS9_$I+9{G^IO1QuF*!L*=WrDAV} zvU2FCYR37t(jwMx5^-WqA!Sv!Eo@wO5Glt~_B4O4VJ_9*a_H%G+iJ7dmzN+|YvbhZ zU^fC$Sk7S4=8**D67C^pv-@r1bVoR=m6DgWSE4Mop_K}uDgEP2hGX)p*U@9la#?9? 
zm#DV2ck^3&epy&O>`c;{R<_FZ3AFpgjE4M7gEK;)S?nK^^X4@}in4hcZ_U>GA5#q- zB9`a!Ldo3YTxGE*NrRo?ob)$1?QI^~kMTA0ro73jLr{TtLgk*qIEx#LVLfxo`Fyrh zXG9X#b!4v^M@*-uraOmENrMFVIUruZVnHwuQ!PKn_FM+WEdOmFTNX-hj%!Z`qwjMf zHNK5)8p4lVL@Q_Uwl(e=-@#jjO5_Fn9%zzGX?a%h7FutT^+&+D zs@==W(rPR{-Kx9iAp)D#@M`;LF|5kKA9&)uwuO5atIBa@%aW5fYR%5$$UZA8dn(IU zw)yxxlEk9wXxao<>%!&UvAbzq2wnfs_C`4gZr5iijQ`6$?57o55}0jOM&oA zY*cQ215BPA4%m{C?|i=+ixFfygM~bp{B&)&>=7F)Vi`_OSCj42#@b>JpprGJ;t+jn z8LQVquQw5`@LuyjON4W*OT%CulUUOd(sEM%^;Tr1)=S!(fCoP=JrbEzyQbe*rOF;T zJdJJQv$8lx9%e7Kkf|3D%I=rL-2+dyA)8v?zo73*5JGT^I&SZ?=IkC>}F3ZiiwnEiLY0&v3gvz7R&#A!1}#q z!JXRj!PIqiyB(T}MZZLa(n#h7D3r81%?QOc(nQc;YQH4P^J6|ZJjZLVh}mPX?ik#d z@UT6ixWMSv#wt(+T;`C9RlIEO+ztkt<78$SXOZGvOXuVomFIq|IHXQ>V1(dA)~Aqk ztG&v@5new}JatOyy~L5_*~v&0?k05rq-9Y0f^J1EX8Cv_pcK15ie077^7h2rp3Tn` zP^{ekN-RLuL~|kTI9GSZ3w>~UG^a+bQ$fM0p!feM?(7>P4RnMDd3TO}DAl!Z+tYmW z1T&gvVCyK}djxXi<;4j2E8eFo>u`cl_VW7r@!44%fkr;QsP%Q(=jVRJ*~>?} z=?E;~g|v9NW#n~kHpM`~rk#T<15)oYGhCtwg zQ`sk9$q^^-X{SVM8GGOYv82^$9Uiq~AtRNLa(((`h^t7qr0r@%BP`?Mk-j*Y!`I zK3#rX>QlT*%R5XVU9^0Lt2e3s3JsFukP*espp@fuix2Wn4NN2qVZUpfTKTH;?P1;C z2-uLJq>fHoUENTYfD$pW2^{A4*^-6EzHl%MnoDNJtJygfy&B`^*C(y;FsckbUe_1r zu!e`&NqKq$gPw?p)ukF{V5p9N(lKjZBQIv?vN;{9|5`}FA>HmufGaux(Pmv(pW7Li+xavoAYd^^1jLYu zk?venxYrR|=KlGP+hlxoslleGsRZUy;G@Lj^un#l*~1Z9E#%%i^&=dgW)w{>*nTr?fNw$xs$52~-M>u|fXYk~FR0K1GUfK*Ouo+MFF zdN;eF>yuGc<}!K+`?p5FA^Vyuj2D=l`BdN@gC2Bp~Q)Fe*pXsw+i{92vqjmYb8At1+a(AZ19C0A#B*<}yVL}1B^Bp-kgnW~bF0s$hVNsl%(?8&B1uEa|Fn9(q-EiX2HQI1#fQ4C}o4j3sOs9bC2O`it~qRqDh zun-q5z|MLigLh?SCaP!abvYFNh8SP3?RjugUP@0duIGryYqkRg7k*(;d;p&KUju`y z@zIO-ew`by_baLUnna>(ThpJwPk;HbGto`^_~CQy*@pIEbko66m{XV{VN!TV(eR2J zyfcP3D$|~P|1QFozX7I0Ti0U7kgA?D#rNoP=~tp7qrt{^ZE@AVX!j;D=sU7_HKn9- zSe}Ep&$y_rb(eVjGzLV7pBq2Y^~^t=K|OAMSOX`6i}hRU3K18V@%^brbF(=v{PTzj z*Su$eym*p*%~D#Fw@?oByt$ZywXQDIKVyGoDCX?tf3X$j3GVBE@AFLR`~G7+k`_k) zPc}usz)b{}SZ6MzpVz>5o`~z`DncHlVEaZ2zgRO8ws3!=O(JdSCM5WyK$M`iyoylO}7^k&~Yr>wbaO7Q^VA?~r(&f=Wfnx^-9Q-+&ar-D~i( 
z&vwT`?EHtHPP~raQq$4oI*y-njF~XGo>u15`J$>Y)74#E#1nB3ySBQy(FR5d$Ebmf zuIi85vpU!IWS86Q=NeDFR(4(KB$r2QD7n+TJi}QLqh^02g7o_&mcI$b`$RQinP*q7 zAY`=1e))nLl5`hw_u~L5uJlkuCZqnZ3}}e0agG<)hqFYjHi+@+qrkGY#7FDnq5o#?7F1yfFqeh`j)wc~e8g!`O2hQhBP0{rcH*2nIz0u9FJ$#8FZyjws zV+#^u(f2i*5R2A#9(V}7O!EQ~koM#bc;KiNd;yLcB!;%kr>IPUUk4*qOSD@vjT_ig za5o>5A97ut1%FzLOG~R72tWlQ_EmNd4~1B;Nf~TcF+$V95w*{lf$Qio+f}KLvYKW@ode8o;mL1LW{4w!~9Di%lDHfNcqH0cqN<&>-y(Fc$M_@Cy)`KDQ zl78&$VN(?rrG}PG*>)uoa&nUUh0)@n(+e_t@e$PxgvZC2r&#L*c}H^fi8fez)BK9} zw!9J)RbH;kJ;6`c50g-L4gBXuNV(p;h-u~+h)5Nr{*T2s|A!6S44%~i205`QLU~WoXdDzBAFw5ZBetW4k4DGW4Pq`~puuUVOh@b6u}$ z*=OA7QisM7Fh8;^b(WwC75w_zYjFSH4Igow?+HOZaA%u1n5+aAXwsL?oYS555 zOJE3d9N2PzgC=)Gbw`APVpBTay#PkSIpi+8Iuw`V1SalI08d_QQQ&Na$ za@HhoERBc9XT;x*4pVh5EF4V|(o~%ntcM(e^}hVG{JyR|h{=L!-?;{ct0p4gP}E#S zjPv%+rPUF3HY=x)o{~T33q|BtJg7}6+Rs2aCUbapZ!_8eCGt4_R1?GKk1Dk%t9cgr^jc4zxS{@WSv$(A-`=Uk)W`Mf zkQEihgZcq=cS~Al33>NzG+6+vA}otmH7Fs-Me_&p-N);S+u{yM_2gUQc9? zO&*?G!d&O(?OePbd-tfQ+-Hl#<%>pz<-tZ4yQBzAxADM&#C6Z^tL2iyj5OuJP5l1i z>o)eZx_Kot!^#YADjfYR{1Y>sxk2pKH%;ESz-mrcqO0TGE$5&Q9=)5KEc@}kKeMg& zWPbNEitFb;=5#djefy$h#iQLW>;L4)$|$%8`b%Wx5hh?!Gd~JW*R(Q00(WakwS%+| zt;SH}Ebc|_RSE7h9iDZpmPW(jaLtGfALC+(5PM|VxoWe0ES^`ZNYI?;)jmVapD7(* z;EBQmo+!aafL5!lv_c@rM}lk&MbQzB!eud^Z3Qf`O-u&LfU{3Qy7j;)Th zR>Eb4Rc?fAHa&fw^{CjtNV7iR{3V#%cp`_5#j)y<;UW86pm8sSVVUauFq#Ww4K$WA zz~i~UD+<aBd4(H2dNHjDNZ)dJl9z03y5|HdpSzc<1#KR^07x9d($i@)(g{o_tWt!7p#lk z19fU5Dia4$nl2A}QDRuBVj3wHLmqe%zlh?_Cu~bi^))QHYzk={mBn{@;d62B{f6wbdVAlRVwyB8 z&pl$v$@HHet#*4p7kTgO)l7AcBG5MY7rk~mW%2LFZmp!0k6c{Wp0J2Eo&5hKR=ufA zF4nzwCvv)NT;&PVoK+qVL{3HgGI}S$$(-;(p(NcewP|j2TXAY` z4=zk13g?oU6rY`%j35N7XZSsbDGm=zWxs9<|Eap1sr{RK-Q90hu}0p`IJAx4v2p>~ z1D|K^OZyBXw+{9wC{hx(W%n%PD&KLZqD|2CA zM4ts8uHBoRAEDHU&vJGB{bc#q;~vk~Va0HCe(OYU$nvK)_RX2^cj9Xiv_kPcT}}oN zwj1kYVkqn1ZU4ai9fFgU%XA<(ea~$sLCEE(Bu87C#O0O{1MVDsc4OSeQgE_K^g0E(H}knp(6ATBBd)%nhVXBC zAKMB?^Q{&78}Lr8Y>RD2mXws7tgSdq|51R!c)q(J$jbWa@^sB+8JN2h4}mb9K%2_q zBL9oik>ZPAaY{*Cb;Cff7T{uI`gI*NhmIe1%>@b|qb_WWzs80(+3=M^ 
zg#ej4a`pDUGntNuSw%{vn~9*X8p@gozJ&!f&ONEM;|z@9j7G{Di@mPbB@E6EvZ$ygKY>*%RyrkAyo6A$xYiT#!L8AT`*k~>NX;YQQ(v*VIssH?ARgxu5%un)F z-WOA`MFAdsIh#K~=#PYrLQ+6ePhMlg z)x12>XhHUN%o677x*%xpO9kN(U_qXugs-}`T@&7)a`@f)wDQ21I6TmQ({CA&qsB~J zQ}^SwFNkOg$^`WbG%52$i4s}k8U6mH!{YrANl_s{=KP`47 zE_eyc{C30s$iPX6Xv4`Q-J|+psDMZkBkWx|r&u43<^)1))pGwqmemKSlBY%u*u2$GYexvo*T$Mo~3_M2OfZ zk$pEoB@+_UxV&hR|G7Fl=eF%ADk;}q5|i6m z!QTG4I}k;+G-+&WHZ}R_^XDeHj8i@NUq#tlx0hS(Yb_JHjfni7uKdh3_$cID)D)P{ z&tCwqYyf7)^fUh|-aS4pr=gKRzdkoMCvn}U1s&odMTKYs0_@^p|AC!j3vB3WDM*sE zlqhWQ>^c=VXmfjEqU`eg?CgAn4RMdsu6Tj-x6vSMi6JTdlJAgYh)|JtG9m<$$&UHW zMO@b zvaqpC;b2xvR$p$sJQDu%;?a_Ph_*IqqXyY-nr%dF;yBoJP!zI+H&aNE#{PI5cO(}g zl_=nCK;e&{XI2jnN-ecxYKu|WtbHIeUck|z3A0G;F3%sFq5y)H(T-Qgfj9O<|I^^( zk}v5JG-Wo(yUdGC1;!I0H9CfU7t$_< z#H*b<2(}FD`T>DA?ON)QprI5ny zSSp}Yti!`INlkGcQ1+ZHbPElIC@qbMh#cVL)(Q@$*J^R@na*1o89wjtTT`zw_7UvX z6%sltkeeAB^LTuup`@>9wj(arX!Sx0@Bpls8LJYEga4G3Mf>^9UhS;>bcOk}P5UY2 z!KHfZl0&+qDjdh}GADe;mLq^zeRGJ3Y_5Fyy*%#J3nR+9&`J(;ofX*Vr2<38-awC@LlT>eU&$6338ifx!BPKNj1aVqNO@-E{|5X+O}UM`=EmlSW@f}VUL)}f9fL2G93fh# z=_QmJR8;m4CS1-czo;rq7SG!=G*ovCN>kNVT$$fy>`iL=dZcM`=#1lM4e9aHRTS7g z2O<^Pnv;r$vJlHls%xd>#vnNRgmNmL9@r!BOy1gC=S|-@g+z{OpK^;+lg_Ieb~g53 zG%3Vqz1#|%=V}}$v=fe$^+@l*YY^%w3mA0fB)vgV$#)9X`fr}PU+{wh&!UW zf6eX(bByPpXg=FYr9OH(7u6I1YlgXQr7c>&KeTce$iqxNG%m&azVv&17d%}yJ`0f@ zFknr3x0{+Xq1tAl0{O@;eit&E^X|xQi#O^i&mXNmKcAL5(J>plWsZh8Dgm+@zq*-o zLv$OD%OFT0#@rWbsrQ>b0r^{X^Yg2NB{=D*G!D10JYO#txKmqTSl@9{jwLx0W`w46#VFIt!CV>UjbGFkQ0sj8|>d_|EEdn;cX1 z@bNAwMf<+anuiO>+S(b|zsbdbR9V<)Gs1LTArfRra4Nl6s8`NT*X+TnvPH}0z`%)G zGlRER+)!aA{*LeNgvfrASg;96N#H)zzG7FZKh)4L$ynt`?vwY0t#9_zp}($Y0;sT` zf1AN{crY`&w0UvChdrS>bEr5`7G$FTSH#BT59CFVQRxxRh8r%#iwm$`EpdEQwu}$u z;HH^U-RtA+-psXE>FZRLU^?N&LUV~&bx5@U>5vf%h|PB|&NHacPwB&jTyX=OybCMx zXCJ*dB2N?0ZyVgZ@45M}Ms#7_jcgnC3RZDr3E}$6(_>#DNTD>9yH6(luaI=XFqW~| z6MgH#H|fJZIuiBp);{$5ZrEvjPL?~g1{>DJwJ3k`|L@E&TxD>xgA1&)hu^XzX`9=< zjy$u2Rtg}_%sRf-n&E?hpgb|VI&?IGKpJO!>k!}aRv=q_y&e&UJdW`GMgvGO{un~^ 
z;s9a!`1rV5x%Sav+oxBse?Ix;VC!8FgMn;4W}qU?_Be-DY|FlpXd#$JvlE=5q7CxO%s)k%InIhh_5IJ z?f~o)!t)n%CL=SRFV(w)7Y zIg=FrX~)9k8?fUQMiVb(V#m?3(#&Yfpe<%HPQYn99|Cgbu*$`$rKj@Uht={nN-2>q zOrO7w_TMKD+@Li)+{T_G&6#h;j_3RM$3tq2cOl;<-pBE8LX7$;goJ7_-sOrr%ztz#Z0Q?r zufP--K*>@Op4xydiip~BVw2P^)D#{&0m_a^4ZWvNO0Mfu&uJ*fK2sjJ)W<>O-LupO z1n-9zC$CTR#5oFMC+d(NCMY34{rf`C7`v!|wcm17e10}a+`+B8^>zJlW4PkG&9}dA z69fh6@43u0qwu=&I3&R2{3*d3?ysyiUv}%D&DMV}hfmvSqRL0D6Qc;t$Qhauq$l%aa?eD7GuIAMPdd$(*>NPEKupeoXx5Dc@;1xw*lVLIA&@ z%ZCCb0UEmZKgGw8|C}&>bQ?^fxX_0QQnEF70TQINS4%XOUHu+8S8z_O6sI;c482&r zbM#$T>j73)ygic#Js|QkFy?Y$R0ima34+!p=STScb$_vkno8DoZx;NU^PE-Wc>ff5 zR!xlBbV*`^&rz8t{icS9;sF>4(6h@14^swG{cALI8+<{vMWe07CELJ<8 z4|%HJtp)?;@2C3eoR6`wOa?nw#Gez0S#S z(2}A5Mb3{^1`^!WM^i)k%gri}4c^`cuJ)OQ`O#@^k%rPURz8|Cp7>Dhan7ODvirRy zydSR56(M1Pu!i5ZRk2MWMPOa=w(c&;enK|`?)U5sF`ai^V>>wiqj@c<%Ao}6Xu+H) z5orS8!t&Y-eQ;=Zjehs&d-49!vst4dOyw(cz;pNixO&I%I@hRMxNU6Pjcu#3Z8wc= ztBsw;wv#4}ZQI6*?WD2J)4kt)-t&Eb*PrKFH|9O(m}AVr#-#FaREqM^kM=Kc$#qrK zHz?R*ztxeDkn;O?d~mR|@h*y>pacR!W?vs=YO2T1;3{A_STBLJj>n0bDyy#eQzDr` z6A#_@{v0rcT&zk^%7@g_xd)VlG$8>=393xkvJ!(I0XmyvLaXgMJR>8e?|cFnlspvp zFL-GC z-0x%UIj(#0J1>8?2INN(0X!Jky;bh6!U=BBsd;dd5B zk5whdkzoM4(_~G7dfuqj`^9A@{7M$Gb+6}3Zft~LB7<`b6Tupz-EHOmT!m^`^^G&O zrkJuPi?2`5FV~&}%^y}XV^}8IMs6FKB;ORX^p$Vz??Q{SN0Rltd|gKDQL{M5jCTqH zJ#^&7Wbc7~wSY71m9OHQxf6**oN!6LQ&!A0r&5mVe+4xhNnbiDmYV>bwT|jshE8ML z%nS>lC_F!EE@^3b`*U;)Yy=_q4A19p!5tl~ZEdQOp@78R41l9&XQKU{RIQCzExJDm zf(rqB@AsESB>+rxyAS{jW$OaL5u98k2DNkX>h!%jn>h15HSF6EF*}_}j3Sxis zwjVSf6da5<81UGw&b68mQtJ7cT3X2xZvuQ59E8(m5_2yPLzjoEW+w33W%DM#grr>Y zE=PrgDCm_BWU$o_M$53-Q9S&uV_{k00bBAj>j)pOFmVB}#gVI)=UR*#28xKFY+5eq zx;EWFQW|N=S6akhwEO){)4&fm77p&mpb)u>tb`bUTC;rQLbHd25l;>WG-eFkw^VsDkg}>UJq0Kqfb8Es0!_mg$uuP@)A0N-<$%YGHx&W#MTtrr|RB^-xgW$ZO888wy2WdI6_PVyYg{gLHisDwW zMWgfyyN-^Tp_0IGn4NXmb;w?+KGA||} z6>SNuATtu0-B?NQE9WScq+G;hfI}P9SeWs4dVw1+7D3$4_>6ky@4_V0WENa&3S%*! 
zX8;NSDzJ5^sptuZo`C{=v<3S|Tt+w{fCBL}(m+R8& zma^F+e%>fBwd(H}yB=EXZzY_*W-Xu^=N%Q@h#<^CozP@^B^J|&MLArAcxXxM_a7w_ zfxf1#ODrzR#suY%y?wZlq9j_FN-d^b@-6x0w(55udf$~*);75F!@UszJU?av$LSwP zA~`7uK%u70(fRr1q@-5j$kAHeF2X9XxgC!SWsm@M7?))NZMt6c@Z(%5Fn-}dLv@#y z3clUVZZVpmcxNX&HT}LX!*hSE`tFXfqa(ho%o6bKgco8*CbPA>C@!J$da!4EFZ>`y zr=+j{zfe_Y>c^X{wLc2wU=2)1bW&O{H(L}>?e)xNAP>JxA)?G6TPw>l+;7>|jsVcf zQQhLKGAT8BtHWn&oI}taR&Cj{?>1SJz>eYU=?bHL(rC6k?LU@>fR+4(iP3TLk$>@? zE&`8znFNT15yMFd7)Ym_oHt^Xk$G(J4&|d(ZWO;RzU6?Vc-nus!vQ1qr_**9(}Mku!O}~exyOh23Fi(#SO>%H!sHn?K{6N+88xGcOLUoC*1M9 zxyVl>r5DaDqBZv$3+EjsYvKixB)HPktgfH7Qmu`jn){G2T)YOfPex?+)fc>on*I6) z#WmznKW(p{1D&n~0|H%yP|h6szl~XBeh?jAIyE}117Pb(LvxLf_?d)+!w$57L=^7q zth%#ZSzLgxrk2Rcad&-7@B_o;?FELj(PMcT3z4ub@->qSWxCOh6ElLIn)>2^vokO6 z=~(TDyUfz^qm*j6O#wH0IBC+6yOkH>Bt7Z34sckz-ykeXW^V zQDJVDI3ZzVQAeeY`aWeJ9r<(_n%xkSzNy$q2PT?ne}>Bc znxc9u@zUc11*0!F;iS%NhCk)zCgVQbz?zO|8f523me>UQ$6?)MwSTJ3zkNuAhcuw^ z+fQMV^#@ORwhy=Fb>$s`wmY{+Vt@r;_ZGOeU19r*m-?$Y zS=m(6O(nSiB2x;#04VoLDC46_WM+-c6Q&}Dm&AMmYMXQHi9u&$H3>8JAU@DgzJuO# zdFbn00}yCF1Ep>QGD8_Rx8VEiY_XjMTTP;x9s1t>LGs;s1ypar@$m?tz7gpcmXP2qJ48?CX=!=v{gTpaTmujPrwR$2Hlh2gfOkq! 
zW#v>wy6tF&@de0-81<^9MK4XQ?dHS3`~&PQw(aBka&5?C{JY>ycNDkOoS44=4DisK%W-K-u^~|uEn*Xp^fy1x0H*VYjN?ZBg$6s`(!jqgs^6g zwscEvLp8K?);yhLEe?KID?E)2*-^URK_rzTZsTE?PqUK(9xA%|BqBx*Q=XW-Bl9)M z;lEDabntY?IUyhqk_8=?FnBW?oA^^79`QBM2#v^29mE@)BsOo_ zdAuImV?@^c1*xL4aWuopBwK&o%*mNQg1NX(~p%W8wOq zpZ|TKzWHA={q!XLnALA)oJv^g>h&?4bDN%wnwV$N4mJ$E9d(&(M*s4Ps=%nD-hw49H9Tzy_q*pPRtAAb?|Mz&~p^7K4j>Xv`A!h1kCNGpE#n->1_l4DL zF!P45BR_=K{N%7|RniwSw6k_C{l zaEnM>t^Td7bR<6;B#rxN!lT$vuKjtU@gJc1@`3Yr%MaEiIRsNY;J&npK?A(nz`S6g1E)T!Kye;Q7wbg;w*3h34{ak-Qc|>POIXEw||Rai@9YuQ;}S5E%EQy?QV6ybSp(k@V2;`;5Mxdv>}6xKtYD-tmNgIM~9dH!Y4r z$}p)P`gu>V0|@Q9vY|p7p-EV_Uz-XH5BFjmjlCfP;!lg9Dows){srpx$9057L}20U z?72GEc3vYk-I0SW zst+Z>{PwE^0z9KLPMO^Lq?4uYKu!`BFp;J+sWNvE3PiX?W%Uq^w;S%4FIlL0@ed00J1||T<7j4Uj*J3hK@^uKKzp`^-qejW35qdnJNsh7u%;BY(G-#zOl4=(Aqjl z%cwdD5&nf+aHDE`ytu&U{Tz@6FiQZVZNG=h$MsfT3=G~__kfappiXGK^tlRlU~R4=F?_NP7fi=)u_;?$`tfb%8{t#DF6C+rlFRc0hh5V8 zOf$34l^GG151m$()xZMxvpv^ZIB70#4`LFQS|xWiGHvScZs4EU}{* zab&nQ6Sn>+!~hx$$q#|(fC^quJ70*u@9rq9rzJ#YSl;={Ot|Wc!s`t9|2!ygTYc${ z?{E1LSVWRiVIE$*mgZD;YwmrEZh^tU3-Of0T3Q}E0|{OihSQeMlx10^sLAcf zzh3AH@SGKN+@*;h#^s(FSZUb+Z0C`|sTnG}u9sHtogCAm*q1r2Q?`zd9>nGgREjId zv)W16<{EW4=_!jp68HqJApG0L=mAC~89GtirAJgv3!SSc^**Jo+=(wq-krq6&bq26 z&S`Sr0+eAXVdXVF!6}!x_Q{^<-)O8w2#Blp4#VT5_jcKyo4c%i=3m_K8hyb(`S#Wk z?1{~nM^6Tkr1Xlb%Z1LIm9jfDQ>J$htbEYabD2j_u0d60hK-e#Y`OAYPq1kvvd#$y z1Jd4CB%of6YSiMs(~rM#VEO6F34MgljL3_Mjl|CW1xLZP0y=Y>zajsAUjxVxCjyI; zDy8Xn09qXL6Q=b43pxWr1!>*2AhC#VpGgBQ);xebdw)EAm7gC=O=B^WACKGqYfgOp z?b9vm?XAS{=;&A$zwQ0GQ(|I0qXFUjpVFZL)9bZnfh7naYk`~09(;VDYxlaRF4{vy zt-IPeBOnN9)^!GYf(;Fr<7z5d{4(sUH>({!LfYCC`;W1)xtXT&0N?&^lc)!Yh`l&W zCGTK>V#AjZpB+Y_56g$@%+MD3w95O`+6*l{E0yw&ZUFkU@!qw#p@ z8LOH1qC%?UhQ-v^ddkv(T`w_Xk%<0O?wdvy)i7-g7aB!qTb1F{#EgPTEeE(cZV51K z1vT|yQ+a0Z)y-O5%Tej@PxPp``g+gZgk_1%>0!Ve;lH`W*+A>5?!kAQ=JpJ-rINzl3aiN`Zcn`qbM6m!^jAj-t*w#40rgnd;3U$Lrsnh zdsG&4(r`a+F4x}2#c!;2`Z;fOG^v%}nVM#$bF;Eg7H@*uNne^f3Qy146BCS)2*URr z%iUFQgaVI`EulhEDJUR$-74|eEo*WOlS4z7zqA3O8{p(1+sTHd%8fOZ-25hN&kdi= 
zcnOvs4jUBH(`<5TDQlP+K9otdAW9;D##uF@NtI4YO6&rkG0P3nXSDf*;^%vR%Ueu{ z0{!rDVxN=}p%dS$xGks0FO1Soo0%^m>%sP7#7O#4o&H8~nBzQBQ3$c+=cPsbdHTX^1AS*feN@@<`HW5Cf4A^Ewy_jnjKbXNNcWIcg~ za+XIU2PW2-X}h^d|J6Lef_ht6G^eH_MTt)@F80ZK9nFOCd;aNBkHp7uK2q)pf=dT@ zp2Mce>FEk7Jo7dq=Zg_k!5i|Kh$}G!Nn2Xxkm4r@lNy;uQ#Xd_=M2JAHWEu<_ZYs7 znK}?-=Y=a(jZr~$quA3yrpuBTim|iLKqP-e4GQhuYDONh{su$Svxc`C7?Fq1Azc)k zc6?-Boa&rob5-f2NuZ;p0QHv|(Kux<81uM${h8{e=Pui;&1(rDo$h?iMu{C_pchrJ z0TKt~xXJBt(RgtnfH({MZ}#m10`e8DwzJch zf`UHz%oTVc|0Mp$&2H9YM!`)-L-4J&j?A{NKZuYoRATg_*WKsZ4K7%3e0V&OR$&0y z&DZTrwWs~l^4EEn`i;q9KU46h{c&w^WlLj?_t-X=fi)(bH%CaOZgpg=r~SIGZS@8f zPx4}Z%;+?a-G#eMG4=mbBMw&>wf`zW`?{npCYu)d9)t?~TOz(IifS3Szb6U`fGJe| zDrp0;cjSO@^`rRUGXa5%iHVlcPUXS^Agc|~@8AQqx+##7x_NjEx4OCAAF+0HbWD%i zKvS#=2*{^%(9_Y4;qd{owC@)nJqg_m&&#h(#RvW`FEbJ`6JL-`QNGC}iB@W%wfeq} zJ>Gz&ou&C7aSDR-SRE=FkdnTZlsqmKFHS?1`aCqan_*~Rk1dtZ(0W*Te06i_j|L6j zeE`t+yF0`%Y9tDVn>h~djfY5ZpDC3N42=JW3@i2+6q0^PDfQi9?z7804h~UaL7%D`2G0|8Js#lGh1ru@ zeK7OGi+X)NF)5OeuTDcl3ODTgdm&eCIIMC@yi7V@&_oO=AE3S?rY7MXx7nG=-xTRpA!;M zPP^u8l=~Qd<)8XAB?xaa*{u81Myo<+X6ZL!A$et@RprId(~DDBmNx%oHQC5Qd}oBO3k4e7{j z&ZY}(#^>LSlT_aC4VMJ5Ujhw5V(9_)355xCw`2AOhbL}d9W)yaHZi68SFh2b+L(FOtA)6`WL8}R{LN`UZ|3tu=H^vX`1WyR z0^RiSRRdS@Yh7eCJ-h~>s1q(GumlGxOTH_4IttXyU8OH=?>GPKc&2dM1t!~4)`UOP zMv+roJN&rcwa=%wZ@`^^pa9+M2ibG|_9Fvrd~6l9r=x!@mUTda#vY8(M0|jN&KB6S zu;BKi2VobZ8bF@4bcs)k^VG&vuAJ@tnE6(*YDqg$q36MqAz6@z93F6G?HZpSHv9O~ zi0qS8$}#N;EyDgS!9wr;L!^izUbXKXC`!-SRL#bSAW<~+gv+T^XDOQtD8F^8@JzGy zvK&axz98hEZzNOj>4wU5?I@n6_la%o+kr7U;~~n3dWB8x(r}|qY)Ywmdb9WE^z^|Nz{RPk#PDc`w;EXguR8vkz&OX(FbAR zmh;I|1T#dg53oOv%Jebr&ZTZ0)NH9n<_MCsHRb?DZL=)HV_0Aih|$xTiRty9_`9{U zwI-`#EwarYQ3K=%+6A_SKYEs|O*-#B@vdLf`Q9ItgDKWU+VR%#9U5CoUAsmMmm+Gr z2h}rloaoQgd$z3xw8aIzB?RIAR=Z)A6JWvMlkCtvkvmLg`;3FMiZU^cWTx-U+riX@#&KdZ9%exFYIfTa~i z^kVBtyk)-Gfpg#=6Mf42$?LK7v)wl6C@fWq3!702!+^~Z6uG4$&&R!k!`+e{Yol)> zB-CI0V;w3h!$ULf^8)R!jVeNpRtj+ikl@KszL7-0U2}SCLxT6se5S)H1vp|B@=^2C zw`2B68{gKK#wSGl_f;g|>SG3`$-__bnn>S_NTSEZ6?yyE7#YdsNBXmkh%L(^#&;+D 
zO!~^K#pYlsXF?D%9IFy*$gH^(CQgPj8bPycEvlyxBxUw3IGVs1MBz-4Dy+?Bn^{y@ zwh`x0a~+qGY7`{cmVkcVX>tm9nN(G4ZgoC*N16kzH_3moIHgaaoSRsiLq3KV3H!F){8i(YOTFO{7m3 zeEe2@;o&i`u)W%0_qC_qwU4WwR;7H}tZM$!kA~FG7erZYe)~sS5^H0+RmpG)46B|t z5^IsKUZSZ{4*ztx{ReV|}pkaRMzQ z3eKT_qT`iTp-p2n@aYluncFwndhO$QRcv} zW{}@LCI24rzqI&dIB?}8`~olOWZHehQE;}3#Kz7HR6oPndXXApQc}R=v6FL<&&U=r zt-5R$=VT{$l8)Kts?zxVntWQ#*{LY+sL$+L5Oraq`)VLBFhg~Kv4eq;8teYS*nDj} z1txYzwRCH%H``8`l!1@NkC8d7N#!z^`Xp!FNfOj0mDfMO*2^=`c!0>^%nr)du2}~aKBd5!c8tv5&MQJJjvxft1&(p&E_6Ja9- zF>LLhu+c%VD@;J)j`2@z|NmcWt}6<4H8no3J9XJeOIuLD7h*n->lV8WTVs}rf`X>J zydpD&*T_g+Ik~;X-%i-@`&~Z$^N zR8$P4q_nM19YsWJ0J1_R?+z-OL8TV!8K~VXp7OWv)#pkHB$2mh7f8S&M=6;wct9T( zhns0+Ln}pDJFKxcSc!=C8Z~K6F+nx7qu%K(Xo&yEDwSx>x@E|!jNQr|eun<#0!NRB zy<7U3FWz$n{(?j~=N%6J^X1EKq=-*lR52m6K&CUMi~~DC^%VCUk9Ug5#THcYIS9+o ze5IS)VfHSn0sHpfy#a>zFd>R_OMdNa-tUzSjwD1zYcbf6F5@yO+$ege4a=R5 z5Ai^Z`-yoomA%We?CV{!I@st@8_by5N>2sM9?Xj=G;B4qhFPB!n^tqUaSbG(V=U8k z`bMJ85wr$)Mt2bp>Ovf0kXr4#NCGFS=~jpZ)@SR z!&rBe_$$ZlAiJCxp^~un*fZ3{N3h;Awp}(`dOn4|cLUKs-Q}73d!71w)y|XcOP=Dd zJ0@u`rJax&Ap+_7ls-_MbG&r_^eY2bKM*&bmt0t)e5+*nhpOWtX z#d7?of5)>HEeDEa8944Ny#k{@xSrsT);piSHu}1M7I0jNzRo*V8$9S)f&rj}|1XyF zx9)}mh@>RknJTUP?QwyApRr20hKK%eycYaLP~mOB8$J( zN@I7WzWQP#TS3mp%1+Sf&tWc5Of@-~HC9*u2pew76`|*_h3|B`7?{dcG3XTVcy@IJ z#`UbNrB7xTM`1JbvfnO^B(`P?_@t*a5#883II#1!UO76>3`X~VQCv@+H{3ouBS_0O zNc&mU>T!KJ(Pf8rQCeNAFu}7sfn_HfvuLp5y?Yf#fw}%g%dT)RE|=eWLa_C;dfsZ} z=l8(KzU$!dD|I7|D1+H9>(kabPeslXpC79eg}Q0Dc&DbvH9kG!d|6Y<4-rU=BHNf0 z2LdN$1$cYygtW&B`2Rc~mr71)TrM()pafnE5}jgX=!d*DfQj?ul#%2mOEeA0`lnFJ z{XcsTOycg9?60@_#SE9X-B)rwF|eBrzSdJW8Vrm@WOjfQjIMj5DsGL296V?0<&Kac@MW2pL6<%-@axd$d(qDxCasbOsnjSaW`p660F-jG-7CMnP1oPw`^(eru+YKeMu($GYk4_Zx}KHk zJ^?ZlXwptxO)abKW$5`H38=yW<%IR!X|X^wq<;dC^59&h@Gl;tbK7) z(&RzFJjOj^6OjY?!XbT-Kib_m*KT2`ApVLB9Tx%g42g>o(H#rzk8FF(M#FV4gG`=Q zgof6=Tcv5;-DV>Ca!=Y`54fYqd26)&K%KRSWL^EH&+jP{hwOCSkQf)wS9^P16L2%< zoG@`7KJJ-X`*aZ$5B~}y>R<4SD>m?6m!mr;n*lPU*?IG+{ZK*^e!;TtXh|ztBWAq_ 
zU2Q^>U5qIo8`2~5fy8oLO2!Lq(gY2Hc~Z%La3#jBqa=sogvEy}LtYjf{etoP_PXK! ztojOK`lyKmgyFapS9e`zNyAGAWIx`_9$>OW-$4m7r%B{7|Ct;a`H&~3-_TIx=-2`+ z91+&s;rq&LPtQOh4qzkOS53fdw{%{&xZl4G#0StsdZnnTi@&=M4p98k!~1H+sVPy+ zNVPUf3v_<(R@h_zG8R0_%4q>XUHW9PcLVQCjY&EZep5o4r?kRvLH{}D@{)Z)u{NFF z=3*@V9?G@#45qvb#%7b(fG=0JpF;0LW;mzq5a-nwd}xvE?NC$u(8rJ|9xM+WJVcp4 zMM)pPVs&YMBD^R%VP!V2^U8r6>M~|pmgN|dMy0%V_a;mZ82G9Q&tWgahAU?EmAJ>w zn-2wP=8=!E%2r8ilbAdlO$at%Lbzo}t$Wh{%V)l5zXr$aqDpemesXP=(&1)%%_4H= z(XZ8yfYstTQ;xeEKHx`ZL&?*6r>jm7HNI+6Xh|Fp0<`^Guz+FaVt^AMT7xZro15LA zHjj8RvOX=kiOVf88|xP&d`(-6Wng~3)W?tKR*R6hcna!DO7IA`2r$c&dyemq2UGbH zjBD%3o*vvt_)Dyujn4hB*`+EPiAl=m8-lcL{c>5%MpC+K`SFy!1o(787`d-6J^6Xx zbajL8PKuHAfhj8TDk>GrmluNR(P$UqkQH^6vU=jTi|h4oC^_z&TVF38Xn0*t_7p8` zcKi;|-<;>K-Rc7)AG{Bk@HpWpFHqodYaf*uUfSXMB_8MrpT1TQH(PnXziVHLu0=rE z-O+Kc)c1L|p+MR~QoOjo?7J6Efm?bM!Ij% z7p#bxeI}z5KH$q-u&PpCeS&!m_$43?-S8qq#ujzm* zWfEqUk}U=Fgk8ELn%(? zn49KlvP9Rm=hfw+?baw)q@qI{-LijZt8}or+}XTr(okSQ49Nr*bOX5N|M!fV=~+Ie z3F-sxiFQCJ*7I z`yu55Mh+pPDP6>SmuskYfwXPqZ|l2DVh z&msBWG1PM2oMp{`!`7_x9wOowc+ByGYY#pFT+&cQ2F)&5%nzH`2OpdKNeQUfw6U$# zeh%!xHK4wPQ(#{spdwZX(C3q}J7m!&_ge5Ue@DF(eYv_l_?`V?HWz&!uCqzBSSXQ_4f*B?CD-+gB~>XfvxNABd~-YB7V$*kUdsUnM6^c z@_#msdp?opL5w5Uf3yy@ss3X+f0>5SgXSWE%&MH?FARQr-8~#LxqIC$qlgPRf0xY- zJfQ!(bjV?iVNyg4$+PERdiLDWgAPI{Q!T4-io}=^H~VrN|14IWc$)+GX$Y3qZ&bm} zFiki>cj>U*emI!KU|O~UdlUmwS2?c8kj(-~gT=j&(5LqsBM{gyeDh1giMk7e3QV566fA0q&-rz@}-E$qPc?>~?jhZ95soijc3Ty2&3P0mH3BHj!@ zFS4XZy`Cd(;^I1zk^TDTT5Y#(BLp%%-bD2{%**wt~JuCGofL)W{b^rgx`c8bz) zipUD5!J!$QM8u#gFuH#D+{HC5IMa?n&8}zKJQH#Y4qw zBm*}WCEGyoKxtS?XWX#t8>PU)0|h!t`g(z_Cj(Jk!04XgRA}*!U*Xtyj)1k4HU;wO zy81}G=MbBV$9hC~qLfMdUf8g0i#6dl+;}}Cm9x^E0C zXy(S2OyXWop)ev~oN@{su`NsVpRf1R3RW!?D)~Dv_cs%b8XG1Bo9;m^v`}^(3gau@ zCK!+rLPm;)S*z>_s%AlQdsWSwxE}(978f}x(@{!u7+Q_py0kYW$>oB&5CB8hTz5{W za5vgVscnA7)hYcv`k1uEY3?o4k#bpdBc~H@)No79wOiAX(I|}WGB()tJyU%tQZsRd zBT^3B$J<|94$Y4UvN@%jvY&riBoGtR;09ejP|-owPpc{gMXsl)-}(!K$fq$71y 
zJ@OYqJ4G7fSidzSCobyMc}byEe5+Nag5R7ByfKc5!oBu8)fYe->2`A>V2fCQRD>(!+UI-2L7^<~LFP9p<*jBx6e$$&jJ zkQcS|(6od|9`@mx{t(Vtu=@8Mtdno+Zba|M&5ip)ylEc?-SYuC%s3arY1x_sQ34gj zgmrLLf{%z5lPrWhUDr#7S)SCt9XHIwsHd*;JSly}K>z#G)e7F+tBa?&@j+VHHcFW$O`%U9^ zdpVwKMEBBDEDYEptOF+V6}!cujkS{*FCEd_HmG)xX3f<-o=%B)5VvR-=rnOvNV+b& z(AcPuuVrbrMA;gX`~0r+tm?!Sk_@Ytd2KOt7HL1~!m*QjC6sB+i&?E8@_#?RkVtQH zSni_Y9xR2>Fc7ipYI$d56<|ZE4eo1Eg{4x3p82V`2sGY>XL2O=B1kcCsuRaK95(SC zpY=eJJ6Ko6X4DoXbg&7uhZDx@G`^aUGo9TABr5p)x@s{fp$fjaAXORN&V11XlNW${ zrj|rwiW$Xwl0-@DuQFk?-PMsMjSv*qaZKx|i%$X%8SGS7>^veb{LOY08s%wiTd~@2 zjev0#8TP~%Xor^koEO-dM~$h4>~@bZMsWW zt4(edSTbji9CyD6^-#{>{=CWT&e!pDI>MNPtt~E_wEWr|1E+o2KbP2mkm$Up=UOh^ zpN~%+7x%csJyuCs83%nA1(^v)$Q_T95MXgJ#|*M_UNqXQ)-2bG(9)E-^9jHPeR4h9 z!%#QaKt|oh!`x?VBag1Eti(VgMGmu+Ne(V5^l9?(`d#&LefI_M>i`_!(jV-uAO}$( zLsI#zsk1^eTe~IX6o3I`qPa>~_XC2RH_CDf( zMMlG`+c6uhK6l+^M%~ArG|G;s=5!Q}LgY6WgO4#{EP_>1bVbX3UU_ZM^^Yr>0Jl^%2Oh~rQg^<3cXkVn6O0NZWUD$!>w zacfN_;*hlHC+;tCn@7YohQVKX__pwYTRAd7-IZXLh69i7w4}b}Gt5nf@FMbHY3vmq z@MD3}vBg?DN7;|yYlTq@4h<(CFDX0hUvooS#fs)4YD3EIb?*V&0d_Q4h{}MNP}s=n z2tNz{Bb59B9iiK0p)MUHx1h3~M%<9Z!@RTnH*TNh*%}qnx#wZDACDW11w6-+dTw)- zc}{;OeY=d5O!V8kmn)UPb@eKz4Nu3nevrSv6*V=^Y*CQ0seZ+;UoQ43&o7T*Q&S?1w)|nypwPh` zD1>89=Mu_PI^Wx4SA4|hW(Hj`vHlVzC;iR|&j%)8O7`_l=FK_aO`rf2)OqYiZA7|K zEA@wxog~4^GsZRe%S$8nsn1UcSQ^-A_ZgflqUFg0MJdP{2#gL=4m$TFi}GuD#_ND^ z0jIEekG9x4N$Dpb_k>{cTKRTQx>X!eGtxLvAqvwAMg${BPolc){Hczh3&lusZbQKeEZqI(b;4Cevd?bkw z#_Ea-kLdSj{N>>Bmf6N4aKP z-bl!=AE|y!&SI}0A>(zyvL!C?yF`D;hFX+za@&4^-ddMyYhtp>r0Wz}Hes$uXDH8+ zz7lz6tzp$Qi%}31c~tN3{UxiPc04Ec?rmpBiJY*dJI>?>w`xu5iWiRT5?cOM>(c%C zA2jPo3KJK^!gz#%oxVtUUasq;K6-#PFTVfm}AP$bzzg6(eT2UzZv2= z_M2cx@mE_EXFT7hQMh>i^p>j()wJzT9rE8jHp$QqH}zC7}V zh5ZJU*gx_{M@L=K4S|&7pSMn3jD65Zpl!_Ob(i2rAS(`tDAZx$gbz)Jw5GsaUfkEg z^%2^wTq^j7&$!p|-GZWGGLgBFDDif8=gYmAMGUCsFw!?E*z>jCQ6Fmx&%9mM%OA*C zaK$QYyj^LLPd4<-Gv5nt{tmbc@+r9qb2^HgsD@tV=3NR_E8Y+XLU zdsVFQA7nzBN3Kn4t(3n&VWq5Cb;=Hd{TFsg)xC?_R6q~stY>-HR*&+Cn|A5`0_!+~ 
zS$U{ocGNuS_EfW~I%#^0M7}fGrT?U|_qh`+zlnDYh@}t3fx9s3w6EIJ9G&G!{B0I? zB^3qtyk(~29X)HJto6LWq@05Jz;L>wP8uF1p1C^Y2RtyFg}1cHq5DgaO%hGa2yRHf zaRAzcC2Ju&#mHw`aYLo){JJ%lLU4s+0qXT{6%8ew!pd5`mrZ<_@vYs`7+jB6_w2J5 zn1@5i`4SdsR6p13_6$7rj#wqHN}|NU((xWjc8%z7hV0ozv}VH13V~4CEoIgJ<*~kq z#!6*WgECR*>io3fONU4@%0&oy)IzmZ9Jn=qKs=9-UyktVK#j=TSKoT~#aZld0V{IJ zPgfmnb54;K*%RjuR*zZkMrGyY<6~{y_ze|jUnz@8&d>9^3VrqU@9HdQ0b|40*Y!qg zt${5?Ik|Q)u))bR_WYbpf~vv67$hPtdv1#Tj)+l3D=Rc2BD2p*y`!{MWw4?5|KPGP zUS5nC7>}LrklT7u&>bu8QBm+LT9`t8C&$+Xg*K>YF`C+GZ@j0EMDYI;x)N{PcP=%% z@z~Np||+gW21+MY+pC& zG9X1@jMetO@$sj8O|qjCXX@o_4KnDca;M84JaA1v0{h`E*qnvBGT-NQn0R`S=uAjd zt$`KeGhTjOvKtV%U!BY_U;`!xbCFRA*N3(;C(9P&y2zW3?VF*0UhcxKEXStB zdr6w9&RV7N$zWUoIat*!vgn7IXL0uK}$l>_5H+~^J`ETd%SGaH(-a?zSg%}ZyRP3~bBr@ylFx8aH@>mzzKY9b zvPKV>zVIsCSa_}|nz{viCnf6>)Z+8Bv9Krw7-uS;9`wi;MhTwUe|o4f`RjRFQw;zQ zr$&?Uo9imSaAf>Aw!-9Q<<(D&VdmNHv)pSR9SfXD*`3K?HcjSQ%N`X*%w^AgGzjq6 zVSZkhg&U$Js|@ZUDwB~n-{wp$H!v?>oTLi08)LyMHYKPu@-&Yr$y~S8$}8iHzj@jk z;PjNDq!AsbP-C#c-~%olALo)sHWBB4KA6?nn)1(XY1{fDcx@3sR$!J*(wgts9$?K^ zqYWHsm;ZEYD+&}~OztuI&1;D>9NTt*l{r*WH=PhkSV%3?rL8;cslJfHA>{YHZRrrN zdGWdwoQ}e(rUMtpjwl-#Dc7x~@fjpl{Cy-XR?(R;#`9YO(AZAJLVp5YZXS;NgbH;# zoE{LTniCQF?N3rZmwD2!Q<9==G?|VpjF&Mg)h(v(rky5cSBbfrZ7zvFuI} zuF>Q^13x(>RlOp(p02Ef2N`Z>t^WNM_9|4IGl% zWU1c z4e|&Y1qJlZ3CyvaR_;-wFn@Wg#-960s};8A%{c*tpmn1EG3Hg%D;XudT_( zlb^Qb%G+5?w$&8XLIe@=q}r8wpai@ZA; zJl6-Q2UOj@h##UHtutd|UfurGSy?-QsX{y=TxcmNRhmDuiQ(fZC}gc4bm>q~WZ_0E z)!7Nj9J%H->9jOa+H2huB!K(-x7L!Bl;rZ-r%N9f%3VQ5zO`b~WXq-uy_i`hIPS4; z#_#P2_Itg>B8;(`aT(*fKTiDaMq(noi;rVrUE!oe^kfs1Zf zia`~quyfg{QwASLiNRLoO@zKPMP<+5yL+uBb=WKpOL0wR|5W@o_ZM&kltkS|FM(J}a8a9ZX7}sEtC+^KViI+q?-zFte z8g@a5Att&!_}tby^56#PI!rmvB*1&>IuhrH2lm!I8p8rxSPk@1oIQLq-8;wVs}UkT z0ZkIA4})+Ky_qNG&9-GeX7_(}i~4m`DWl^r3Rjpvp2Q)i9EwUyCBAg^;jnslgo}Wp zi`B%O+}xsOCnmupa(Ag|qW_w2;?0n|KUIHS=!m!or@@RQY*=0`txcc(q}cZUb>;t% z^_5Xoc5S!PNJ)cqcSv_hgHjR#64Kq>At5a(U4kIeDV>5K-L-*DH=AzG@_oNE#`(r^ z=%44$1M6PviaF;sFGfhnH5OBsmRLOJb#zw{QHaq+6BQgz@8@Sjf@_!`6Oy9yngC%y 
zpOvJ-1pmH{s+;D8-5+@_?+^M~pY=x;%y~!Z0@0L6yn8f()1qdzRbBt`{h6*i)C*${ z6`uiPfaw~A&d5nyo|+Boh2LnQ6sejYLqx}K8S71|zD-qSf%d$F&sLuzaO>75`GWGY z^DsMe!>HLLS(s$zP{g}&c!`q-h#q+V5Y`dm7`UH&@`qcl*Pvc@ni%&CB|71`=?F$klX*~>F{ENTC-1_>+a$iinP6Sl_7Iyr6Zm8Fe5M=xu8m9rwBV2Odk z|8vTbVLP%%RA)C}-h1bm-dssrQ_$`E9MRKRoEQ5A6aJ=d)Bw&0Uf{T)atH zAUq_E@;QevaQh;`MLW_w(NEnKYqKuGTZ}-!E z$E;=K$gOOy&DT>+P0g|q{xlGv`>^o)k#C#$`jx^8MO<{fk&u~CZ56V7y0=FMa`m7( zmH6rreKi`wNyb${8!VuMkJpp07V>@4`v&u%zlhg~`jhQNthribBC&&dpto>DBYigb zb)62K)>Dn0Z-f(@?6(GA43T&i>x827zPVEh|A$4{Y7F*huwqj|k}&?}dp!x3mX>O3 zYrkROp`)Xtp(*IOeQ#huj)ylnH)mySZEa#Q@$tgLJ>bI4r2UQ~$2XZnt`klyAqoG< za+nSCX2svF5p}$9i&u{(dU`|za)=Q2`W5loxvZVanM}fKs#Rw}Ho-$6V&~6X)EbeH zK&6Nu9^v5T&cNscN9amMP7ZdB->i6VY-~D0n&3Kcgv0%mFa=o%iUu0we70^g242Lr zKh=YL7P{*Oll#<|EaWF1G4%8lYlRDh!K#)pAe$EK7w2voBgwL! z-WpT0v9a;;*7fWi?%mtK0(3MT8q7E;BJz23ZAA}=TCnyHC>nt^0Xh-#nU`Z&t;d6B z@A_6@z4|br%;%EPz%Qv=tk9>sBIEq__F|yPlYjAGkw1h~n?_GZ$3Wkfn2Zk$7eCe1 zAcwe((91@IvT}^*=y-w;)7K)zgg-dEFa|%F5oNIayg`pXo}#pGc*mkhG|%sFakH>FH@fG8Nz$AxRaC?d)SK1rG=%|G;HVRTML9d6;crb14 z2VlZ{cB(X*9Zk&Lt3gcr5RW3q_XHFvTS4gK2(0iHs=1M4dT6KdU7$mxk5FlLyE$#D zHWg{9PFO)d`})(8!0n{mvQnyj7o+b^pdhuY=+zt(zFueywef;rd0oPaeCw0nKPFT~ z0OuHa|0~&Z+;FAsZz{aR*Q|Y{?*$TUWCUeAj+TBIcMS~e3}*_&#l-~&2iIDTaxgQ) zea^|rQCC-|xq|$1WmGel5tROWx;d1TnAllURI~}q13QhXKB%kb|NMz@*-%+|yw(#T zg%;8CQUYwADboS7FJO8NSd@R|%n?Fdn&g$`k?EM22D=#N=lLoKN6okdwNook<#oQb^Mu=^BXKlM1Ws%clU);2J5^txlI2C zBM{wP&Bn+2cbWnD()Y`Q(ibY^*4C9+gGvDdy{V{&Eykxhu zL>Qv#<<;~A6c3;H;{qS=HvtA%VYtliw!=hqa&*Mw>Z<(i9VabndF|CZl#^Vjn9q)i z5ffxtvz3-;7$bu}4wC3)%Jua}vrme;A>hS~cn%F8%WN2~XGja!8UnxipiH3xI{YBO z;Ma!rOSotir}39#713Hu5ONeytu5{{K$oq^GwEeJXSf7Q@y=0L>dJHNLtf{f?@oTN zsW~0^Ld#?E_Zjod46r~Er~9Jth5?W9TfWpQBiQZ5ey~Mom~;dd95o8Ml9G~}@bcK$ z$>va+vDCYFcnb9m4Ib|9?qG^FH~C)cIS)x8D;^7^ps}EgeSc_@>oU_YaO5f&rkvyV^BL%1iyi=-1fmw-(OFy)Du8v{@&KT|<$3l4DxlXv0kL6u>XkTV zM_+;Z1ft!4kKfK;w_30ozpFr_fB!{t1oXidEX$toXb$tz856pSDv`m+=-L#rN-&RX zGSZoC?9F_`fFOp2ids@u#v>pQ`1B~I5FZcBR^LL!0`9$5M?vciPG2(f*M|y)ktb=; 
zJd-&rHNk1PxVXf`#8~kzgkzF|#}JN6>GL!nYVs&4EZ5pXvx4Wy8`|}X3+Cod-tJ)V zHC}afe^!^7nXz(Z$7W_5 zkamB!{+0z2ywSVQT+7{j zdV0dlSxe|8ra6-DJ6hR!4(z#mUubM;NeM6CMfW0U4<^={Y4X=m56b zXSb0yV|2W3&24}1I$w~u(tsqZ*$}f4c3aSrVTJsr5+oyxPr_%!JY2D@DKA=l^^OK^ zWnN@8(@(WkeO21Zl!r95D@(W@6mCD-+tsC{d`7ZjHCftpILikHTt{;$XJ@o|_;a9Z zYgJ}{3TiD?Rnl=2b16OVuj{2FIE>m`-8+MFz^;<0C>980h=tV_VA|Avi66Ql5d9BX zAldM%zjUyNg=o+6{DP@0AZ#pn!~nfiu;b^ejX%J^S&Vg%$LDorbK+{^LiuH^7bN7w|B1u(O*QABUs& z_3_CObcQ!$;^4^Pwbn2(F-h_QE)c*9Ma}cN#5?o)Gt_^3zN~{LzU}TSC-qwj0l%us z+P6kx?AA5!&%Rv7VpynqZ{FOgzUYN8vlk5vDwdy+5b;8AB5X&0QgJ_0eHlFSSru*i{;|L=Q6#RgADa(ty%uEf_09o|XP zW0J?Sl@TE!F0!)B(S7PVIylz+5k1~Fr-iMpbqx)AwLiE(S~6x85&{$M7YcMP0>sM? z){N3HLZB27`u8eP|DRWB8Vp^Am!N^r;ey#@e=72Uvrv>5l4t{42M1uLJT_L!LX_f# zHqPf>P$r(AyCp#AAfP2=VPU~Q?wOh4r=ycGGBT2uF0A#GBU-imd=<;c*&k;2aY_S) z#vO>RZySLQ#IcgGtsA=~lRpkf-&u9W{cH!y{ydP~W9?$hBW;53P z87>vt3Xiai0B=F95b8wGVO{PN%z7U|II_OXt4BI*x99jx*W{f6+1i2wpr!s{WB;^$ zMDT#w1HVWLAlP@`1Cf!WtZZ0Cg@dW-dN2ZtjIfy3$*SdOPAs{|C8)j%L<{9&n_ai# zVq*S+&z_u|oYA}3*N61gt2;`;y zM@I4A`&@Hl-Z#;!ff(u|uCgM71GsU_{~}i{NPByGV3H~=H@CxP`o|Yq`HubwFKYE8@4-=C3|9c(4;dSz?#d21a+!2pzh!ZGQW(`D}l zBdy%BLW{rnEz(f@(?dA}BjSDzPS7;%66bk6k!AF@R9E9Ld0Gm5_l2!P#aw%WX}KDG z_ia}e>-A(S#LhkqFFL!8{vno@4z-LBh z`gOi$ZNpc& zyd#@3c0cG!WSlKg11I(0Skld4mr2fo@Q!-@%<8wmz?sh z>zDQUt;I7R*TU0|T^kX-*)t9%;&2T&@K+Q^$sETUBWmtP7kCQfhe=5&YUjzux^lG+ zJf-rn>#)_1J0{h=Lpxsz%HKhHs zb?Jw-FCkqg89>Yyr_8)&7_}r2D2(qoP+2?_d9_UQWBKOIr%$pwS>GppFtu;nj#!SD ze(?wj60>RQ=nM^j*3h+?XobLWJ*)*nl*UOWX1ArlV_(F%k}d12;vnMkG3Fs*E>=0? 
zyQQUNuBdlnpNWv5AOU3`(l{o$~W!_Y&$TVp_8dvr@b@lJm_meH)O-oJNrym)d$ zF0B6XL;LO9+IhQ=R0!fcXAhw$oC9WD`uaSeWC@HgybJ9jAR;O$6Z!A+&6|a&N~nD4Lx1N@pWQEiFx|IKCSszb40@a2i}AX&45Mt7rnFHk3Mh|O zkhfBW*vJT_11pVObe*m)mW{jmTa+jOvOO1l`@rA&*)Do z&By(nEF!$~Sb~6%)O#5a8bF`l6%rf_N3xtlDM)=-{L8B)<5Hab049r^IUSK>Ne~k4 zi^N>hR^0{VlJ@=K@AE4Dj|r(a``};Z)<({ynQzA(QJYQ&hlj^29h|JJnTd%s8JHt~ z|FUs%j{fVP8ZdTEt1AmZos#ka}8Dj*1u}3~eUpkb%=J`A2)rc`>rUO1%OZ1F5DEor2(e=~EkPy}i|IrCg41-N; z!Q(sTKOM~^9fTO)Ca7ZXcPW+|f6J~ZJG)#~{gaD|E+1j&*6FnyAt_j!+`w}|Y7Wsi z;Np77zF6X8LE}DFIi!ll>0tWR)a2C}1P4m{B!;)bo<~bH=0p7U3ub0!=LhpVG+(aW zl|^YUbDF()>~cIyDn-1~6AiL*^kaGgBs58}LPyvu=RNnCL;IxiCF8v?F;Zg$xdyxk z+%l#1Tr>_BdCKt|tFNyYi8vuCiHRp?u9K7RfDK4q$CIGJvfCqMG^dB z6_;Ougnx|1KvOSzFN>53gpgf%;<{Kt5Y>4@?>`*+STz&V`C!<)(sHrlM6Zp{V@aZi9G)P zqpixa&&Ds8%rmF6reS3@rt_L40siccP|xD`k{UVt%>ZzUy@`!nQ7;$&cU7ziSKuP#S3>N!MOz`nr?P>jC&E?ln5qx_#{ z{DVT|XuCgAP?V+F2gH|haapw=d&saNTI?79BqvYuSXQ*TZtp_-a-dL?qa)gb!)+?m z+nsT(t*wVJ=vz>1?^hUD0@T}>dmagio`Yk?c6P+U!}ehmj3F@o-@Y3hqa+m#825R# zuQ4=>+M9Xh79tKZdt)eOuP-h3u^kNs3SHdQv^+_uoIebY;d+i|J`0&aKm1b}$6up; zKA%mDE60=$x7`VVJ4L4g)n9{gSBT@L-^uCni7qSh<^Bbm4f#n_)tTpv(XD)qJCyy} z`q_;Rj})zx(H`kbL^6jg@0GUN-wKiS6(ysfBN_)~U2pH?wEJAM`&xTDxM3?2fdS!4 zMS6N8&wT8xN_@B8eELc}KIgUYH{HQ?3z3kpeR|4GNlDn}Xem1;=3ue$DK18|#(lqZ zY3Y67Q!ElvQKMt-YruJaadSJ@)Jzr&@FU7KqCZIa_T5s=?x1IX*7L=Jk&W@7w1sqo>gW{2B5 z2x}Mlapi1?vOX`#^af-VVRY+s?n6cEr#Yx|1I!c8VOcc>4KqW?Vxw^ow3GY-0x0M* zG+fNg*5J9W4Bk{%efxeVN??A|s_c96ereCOan|5E;>|7a-G9A_e7F&s8e$P*nut#F zXDx(4^4G%wm>{rjH2rsETwqeDxY!URQ^AIsn)^h0!{+8&)NVJA9Nz~Y=}n~^mJrPjmJ>1yo-(f4vsyS#}YVSa~K-d{wT9uUst74zzk6k z6a?Es#GOruLZ;C66VFdGjCAAy3@e9?rdBQ$TKpz^ugSRUr$uPzlQEEw{)>bEIz~(Y zL!@e=vFA78M9Yc#C@@Doa7;#f2v1|y?ew~Fi3SW-!L4r0%n#ob{tJZQMgK7a@sfaG z+^UXRI$S|X$wiRN$@|mIr8WFP4+tAJR#pQ;Pj#R3+GPO=AS){ikco^HQY6gx4R-0t z$x~4TtnF5FWjd7rJJ;0G>R{TQueAgnXLr?vL{fKm_rii6Eo~tvvXBrFvjm-AzdO6r znYg*RdHM2XO-)UBc(@t&FY=3a){49-K9e!_`*0R^xzl;PY+b#u9+S=svFyHatyE{S zG1r`=u+)NsDZVe3CPx&DRmRPxLzb)@X(FJypPJHSXJ;Q%(br!DKS)$`BbgF)S<{=q 
z4|z3-evg~}UO>fSHaW-U#u_kBzgkr~Pd`aBsRYiz z%Td|zR}bquZDY~EWnK*FMLoHS%zX}`F^Ds&766C`8*l)G0u1}C01nX8-CeAb{iCw- zsr+7(WxBJ|cqo;9viR-Y^xem^tzkfcJb^hvil?VZLNC10Y!OCU+V$1dF;Sj~9yrWQ zA=fRU*Kw#@rCkThEwd{tPuu*C0OWuE{5hv#E9jQR-S?7nbB$~nbP!@!s!jWcz;C(I z;b?AtG3l9c@gsD!7Hys^hb^7g97G0nLa}M_VMM~Z^|_;W)tMpwsc&5Eofoyrv}I*v z&JP!_PS$!nJUmw03>zJmXYbz0fI)y%an{?5aYeD_(sGHN_7P6^I6-a06;z0dWs>1z zA#yy)*cpllB#H~t+aI5+l2P|YPEJEn(POSkijYw5^3rvsRmjn5uD`n*A*>s~VQ#CP z%+u5E9v)k1oGZYKhZ3i{so+&xqe1M4mWCY>YtmV5)*TU_NupdMsGRplYGykMEGB!{?UFuRYf=X zeMnzHZtVU?wilG6_q`FgQPWZWQ48wqkNVL+@q#Y>-`HXj?BvAC$|^EC8es^~ zaz{r;;5bnxit_V=`%GS9AUB0uaHj)awx%XzKF5kTKjKF{aUokc4(AWH}%QF(+w7DewH_u=2{Z?$jXu0uyDB5Q~azP&$!!IDRNgIhNYwsR2uptK9IbU z%pKZPXX0yaw&^4wB`FCD+eAhtv9K^Klz&0VZx>Wh@H0Q3qsrN7&&tlu>p{3IKX|62 zBlthK%=73za0Ym9hcU&USzA*Q;D4*HpU!xDK0Nf74OGvQlao91@ASXU@c|a}e{_fc zV2pni>|%{rcFl<*Ggh<`1$y7z(=!-Fjfn*Iaq`s_~KP*(Bg4_4g0Q| z3&8rOz$bA9A;0SrNSm+s>mVq+C4&CN-n6oml$5F}r?_~IQc;dXAY^zLGx^!lHDIPX z3xZWBMZK`67EKln* z8X6~F_xaKhojHD|8;aJ}yA=j4okzGSrdC#{x2LSVzQ?hQg>QXr2OFJ^ALKMLhM;e6 zNd`;h1)`or(DXDk5{ruH-+$#cN1MxyJRypb3PmIb)9N6010|lSZY7gLqUG}C*0rG@J zCWsx{6=Y6z25|W{znjx&B2I7t5V=ELSWIlI6F*!BW>b#D8vZt8D0OyGW6|w(zr2dLGXhM0QTJ7IdKO zNWCIf1zN{(ClMg(UWc>^3W;&=F& zUri&S6JFokd^V|tbAJIUg%>YSih}QPaBw1DeYlKy$pZ*Fbi`3GKQc8n)!&bzDMg3d zL6C%z9C*H?p0`tw_HcKFVvnE&tTgTCnw)8w-~eJJ7sZW1!aFcI>H2tov$eG)c_(`T zFC)+>rP`(Ke|L$L5~AAOA5Y0?(5z`-ke!orw6&$J&l)8k`>YLx5L4-~Sqxln-8LV0 zZSB;Zf5E0F;zh6lFOlPzdEP?-+({lB_(2PD2~JN>CDWv?WVs;1vaN9Kq*!FJ?c%02 zlV6G?c%fpKurnC7h+7H)#gS1V8|!jco7ER^Ur>;c-2ct^Z==H+^xB`aq?qAC*9x15 zrLRKZ$o+UvE35HEDJa9X9APf$4m>5bi?jK8uijIptvSpPx@n^)?R^~Exs}UsysiNG zK0XYJc>~{-^(Oaeg*u5x3g$o$J|1G!RV6}{=l3q6Ae*7N3GlJB+(wphS#UxvoV+h> z5+{0%1PP>EVHVZz0ho^*XJcv#0%JjOF>i?5pwJ5+yK>RWAlUQ z({+RWVi0m@haEquA2WO!KNlC*t5?qg0ncXCL9wBzs0h2bAP{RZeE%NQ+g;#|1E>q+ z6T)webNp)LRg)m)z_lQAqF|AsE+BJkg3UgGkgAUe@1OB=;M%FnI2gA|wgP(n373XN zUP|gC*tpbLvAMCa1p=aR=QD<+&;9G0o158LSx3t)0uD=pV^U)-C(q&LJIz->cIj{! 
z1)*HBHdr*;uz?BES&_!BzqPrE#QiBOjGlo3(&UT~-3M=g;>%4-i@=M#D+&uZ-Rp_G z!J!l7cNpSkE%f-!Q|9)<#u2o^TWTiploK!0PG^7nlB=-BCQ*t`)D6 zmxJ{pLwCvUQ{Y#!=ssS`D=cki!KD$1G&$mqudeQNmQZFpw&T#|9RxoWuZp;iR{`cDp+u?>Te*rfy zuTT<+mX4 zJ@Cm;a| z3$f)_MP#ptkdQ9^G^n|}fX;yiMz{>aLPLYcY;oJgrHLsfgnzpI`=@Jo?uXA zy!FlgOmwl?`-92eHNoPX+wbuzZ*i55aotK#R^xnh;v1)>E3j)C?V6KWn-uKEXdQJP zpUX8MVgLIXmB&d!-U@Gdu-9?XBcaa(uyCogQJ_to$NECVxz^6{YR z4Ux&Klq@H}#}^e9m0S;g=IiSV_=W_p%LCm6aeycT2a}K=AiKq*QWACe-Ef!aoxf>H zO(k?POtoCsnaNC+-V zP0oPtENG^orIo6VctQDl9oBimsFDQ$2u$_l8WnuU&J>im-X3E6{%@+9sG5%OW#v1dI6F8%#`_$CioSYwvkJ_Sx*F;ZzW?~~`1pX5r^?57;U?TO zH8s|4BFwZmmY)JFw+N9G00U83Qj(HDCFYa7bx7{OXLyk((tfTz#0@ z0115e4G(HQ;TEiTDR%O_Jo@0{gROfGfONS73gtB*jf4Lce{YdE7JdysW7$7Ge({q4 z)t*%V2LtwGE~O2g%+<~~?Rd6_%r#y0&z}dN0@a(xpgLiqUSWBf9bR8n!s>%utoTK$ z7RFaoHoI4(wKtyzZd<;ZCspnLDdY#2&%@X>mxGCpD<=PNH--jdX!Smu8MYtq3>K@c ziE?+-{PaWAy%)8%P=0o9?k`|fC`AYt%4WUX-Ra%k!+;VemiQwH5s_liOgu3T>Id>z zIUTL(qT-yI`t0nC+mD^k{JC}gAgw@Mn169Gb#fA9KIA7csu7N}Of4Xg3t(E$jR7CT zU2(1n&?5rNC@}0905rypsLIQ=FPityOh-05}z%vq;r1g1$6fsa7a1&RdF@MX{S^LXMC9g3^#DXD9u zp1IWJteM#6Rm41cHobNlm{KF3sRsN$y~+e?=ko}E$~Mq=J<2tYDnkI zjH<5g8pv5+zU+G(3a5Q>UK`HL`a+8UZV4lO+b>^F)zKW`-@hqFMx<$Ix^u+a06jq~ z_V72BR1wTHUmY!9FsddpsC)qY{yfl#JjJ)2J+q~;MmVwXoJ4K=5rkSYhgD5wavWvS zgJt5#R1g$GuGt$$M$ z?U{u|J16zpMNh<2q|9sKw(-@j%jV`iYU*HA4LVQ3rP1Tupr9(9N)-UM2z)(q02SIO zfUf&UHSU6;2B!DpD)_Qr60;o-f1yWuLOpcI${mkYst*JMSM23?7}7rH6#6hXvth#H zC`Ktn5mUcReB^hkpXo2z3QPt7DtMY`i12EYTS;iIq&Y`Ua!T)g7qBK(YBfP#;Y z6iBRydzzn5fD)#JH1RMUpPXmxC;oJ`wW9=KsQLn;6tBaQ*QuF2P&vA~y6T#Fg`Gg9 zfni|2j!B{n8J4tcUrma&WAqHNkKP{1UMsGLoZ-jUr~QktQ5W=3(^;v1KKb-Y{8 zcbe9LBbB|RLn9-HQ`RnN9QG~lJTK5&0PWDUJIP3cYU6l}OXzViQIyTkFa4>zmBXM| zNGKLayyOfa^7e;tjSh#4wcAs7SEX23(mg$u7so4ov$K|aQ$pU}vP2wYsTmEY3Doc0 z+<3o14c-Sdjpp*`HBz|lj14L#1<9VUTo~>Nt`Hq_Py|bJt zy@oN>7!6HSx!=4DwsJZaVocw&{xQ0(0llZnPE3V4(=J@T;OH>VIa6cR2f zRi8wTTafeeOuk4iew{6$KO555J7O$=1Xl3Dg;71XcdvgS)%M)CgB=dpuGh4#*J6vq zoLbY}3kP+d;X6L(mSAT7#ftm6{f?F_{3FKizz%N(Umv8WMg}W+mtTGyI(bD!O4wR< 
zGNJoO{mvwqD15+v1s5$0bMI~(WRqFSv9`NTJjOWIW;Rs8mo@0h6%szT_v_oP&vR?8 zKZ)bLyiIeaWsjB8J{-m%oh+&MPt-u#^5VHXmT34WJ+|>j8tCG?6d~VL``i7u!^3a& z^q;0HmanT#QLgVecjufFK}<-bj}sS1sjuf7$wv3OzrHC^KjP<~AtWpZ;^djiTnX>f z!SnN_#l{{pG053=CF=8Er=t=O)87~ga#Hf>=+T&%5k6jT4B8fuqyn;AngU?{FpoJL zLJ|T>=+>)Wo+z?uRpCu{du?qi&J~i8e}-4FalRo(JO7<-w^f#wC90AGai^5~QPowV z<|Z#}OfH+Nv0NLI3@9{WVKhSKRfX^3ImqpNJc&?0MRBO`1$|5=;}6`=_WXhIx6l*) z2c46nzvJ>u@-tTTYGudvPrn1nR=Nfnv}kim;O?WIBD$Op`WBOWzK?_<+mX$kiG2Ca z-)mX!pUA$E#I>&Xes}sE%mQ7e5)FwrnTD#*o{v~*YA?oQ^O$mVulYS&0CO6PA_bd- z#5Pde0c^T6>eq*y=H}BpsTpE9&gd-a)JLutlvwfcfxle0k<}nI_CwOP$(7mx%o>9q z6xbR+3dIsQv+CBq`$c9)vkuPghy?yEKk^QdCA8L ziMKg~UWr$+US$X{L(xAAKT8YH|MQ+S(^bAF=9($1WIl!u3WR4)b>ifM2gbS5PAcrW zw$OL>J?>@2PK_pMDH475vggi^GSM?vEjQgg(iV<>wj~#{mdv)r6N)GfWwABbgkh&M-|7od!_wT9 z>g~}az+j|u<`$RE|DAL-(41PIn;ZJz0F?28B0{l~vyQ1>V9v7@CsR{)`im5J40B*s zYPBb#uJ|o!Sh(T%ga#m_!i$TGHD%Q(;%A#Lh(%i37aJ>)Vk`Xbu-Nnoug|T$CW~(| z86I8B@s5-Kwzfm==08x!<-_CT7Y;_v3mPe>6z#ZZ0jJ=jqYAnks1Ec2Pp$g>*pKP2 zebJ0ULctXC#Rp;H{S}+F*V!$OS;}cUMN0) zojvOC0*qQ=OWicfa7my7Z3q4<<(ryHYez&kZZWuXZHnmbJ_MI)qrG>o4j=uLY#eYb zW#{3@eShFS~G+;QR!>ygoXcL4qPLNGLTX_)EIpIhJm;wMi(Z@u#qC zch|GPo6~RM$ZFM8{#_zqwE!ANMlRT4q5MZy?Q412J_$cB^GSx4)zvda#gDSG z^gy{qCfa1X(w_bO?O9ypoX1l@K}7!u68dk@V=c^kM>k$A(JOB$_=g9q|8ASF^{+5m znQZ;_JR||2yn1>Mv@{JbNM}?RROhfu?4j(Jf-c@yn^yF~1iW9m`ZRji>@)1v`3t@m zplef`L5J66(Zgt}n`|Zn=wPZ)s38#e?e5#+$RHg2tUxZ&+-vbHJzRJj*-Y*(k?X>7 zX-q7da7W|<4jTDLssOH>S@-gqn8p?Nn$L353t;LLu)wZ_> zu5?g-tEj+=qQ*YCxUj!}SSBYs!2YCP>Zxg2Y5;vWf47z39(eW%G46r&IXuqCZ-s>Q z0XfKJ!lRuMGRDSv(b3Tb%5XRF@-L2E`7+`zF^}eFN|HW5Za~Om-T4R%N=T5R8b#R% zqkC>Hm=Yr+Bh?G#n47B-7Jx2EFWN{W?ylzzi^^y6!v#`&x5qz8*e7f8 z#6ar=>F&2I)oir}Kwi$Pk9|eTaDflgSY(_60`F8EPo~RSkTG_i`?eBvhlJupIP{h> za}A~i?T0bqd&lh6`z$?^O*@(eFb$hwt5;uF7!8hYkB+cPirEY5!2(+iY&1Dp=D7~$ zZ**tf&l9n6+`rx47`(z@9bjJSV4=HQ1E#SbkLlrf84Nm!8eUBqa2L#GNolY4{-{PR ziTZbd_!ypK6O#>%1o2PVoqfdNsmU%`epjITwctGfvQMBa(L)DjK#C)O$y*M0?l^B= zhwMx*2>N}N&faM&VFfqQBY*?44=OikZJ!^F&+DNIwbn98q&~0=Xkyub0ip+F*25X| 
z6^3Gq)9!MbQ=!lCQu})mQBZO_0>t=*g@5|{wzT_;fI%HBOtJOVs=dJ!13)mE)q0bV zu(WvQ-rpS$Twi0Og>m~e*_LVlUTp9lgqJ$=TJkt#@YtQOczPtzWC2Ff`ALk9zkeeu z%QpxlP`B4Bw61GR$X!!#Ku@d$!%d0DXq>8_Nk4;@e;;YyUN0wX#q;Fb7Os%gfn;loo<&2;7iS$7Huglay7gN-93d@dXr?r=l zQh4Z1w}`rWo6l81QqBVn-+ph8Szfw-ye+!{DW35yNwfRD zk3i#Q8iyEQw)sUxt)RUGA3u`*jwKZgXx)CeQ%p^@FV(c$9-+4!g}}~4fJ*))O9~i) z46n5;#Kn!<-P==`&hWYOIKPGs5n~8jCA<8(M?lezCE;u8a`U9ES88pPhX|41g#@n` ziAuR>bI3_`)teN2Fa7&%lQ8X87^C`dNpIZz9*G;)o=U{95gp)GWGX0EwCNEn z&PNe}qNyuukWQtLNQLN!lO&4*8PPDWKAV3XtX5VnXU&B|j1VfzImkg6g~~35No^El z%yeQcZp5^K&UZ1jb$&WErlIdTM}VW;Q^-@Ijx+w*Gv{48FYr6%J0I~%!+Dn$pOpN= zVM?+CNu`fR++t)0`WWclx_uzzdZOp{oiN;4pL8(h8588Rvt%oQ1z7mb0Qk72b##!l z?lPhH$Ep3XE}NW|$%s?{5KP)^Z$ls2BXc#niLe!}XN8`0{MRdD|HoQ;?Q#il8Z&T9xXxda~^IL38uU*n5rEVD~iq)B}S6 zLB_&o1FkxmBhut|6&X$L3I509uE@vwFEx^lxLso(!qXp-*)z<(0n%5f|9AC@%-$MG}-MfJ7o0Z83P;1TGLb<9bE+S(Oc6o zVkKTMnzcgjfByW+?A0!vuE1XI%7#2xR;M=MzX|EqI5Nofw#T@@H~5?J48$uyZ)zcf zqM}IrbEDZgDbkluS`Te?ModUuOask$0&mZG8Ly70XrAs>k4z9Nv#_T>{tNCBM8`F5 zGst;1`Fd^@3=3l%eX z)c}9BpLRmy#&TcN3e9b7EUf3{*P!g9Mc1>Y4HQT-rQn%zdq_xXRYFb|}$wavbmIdSaI=Vt;mzD`0vDT`DDx}-& zzhe?NI(hB2xZQc6^XKRVT+ESsQ2TC?+;#YOj)Voz3jEC6(@EB8T_4o*kYaw9{+wTZ zQrnQU2A}M;haj5L8lmg5e8nMm-uj2!KdlqtJi*Wi=4y7QxAS@nU!;)Z z|2>NtoMVd36UjdDImw&TS4@!l$V0_>WeT$b!P~hC@BU^^h%EeuH_q!j`%t8}*cd5q zE*bg)KSU@Zs&^+eke~t+5)Cl2_rwbMf${UJvgk{rLn!;7W?rYf6wTI-7n}Q0(3V_M zED|N+!R;q)i5%WY9lk&xcO>LckDF6zl!!m_@vmVO$N|g{$&}@AZ}hNF<{g*I5$;D8 zri!>Sr?v5q(J#IQ4(tillr6@AJra)9v1x}>kLyI3sw&s73d5z~PTY?B!03?9CI?MIllI+pv{^I!w5K-@-tx@aZ#kp!L-0ZqPI`h=E|#7&m3)$X7V* zcPFCgc%dRYyE_Y!%+Yo-Cg0Qjdj8gF;b7lHha&+wEA!3yrm|r)VRn&gU)i%cc0>Mw zRtG=M>DW{NV=eGcwYEnI2#%L%E;+)2)8Z4;5J!4St!$LEGe?GXVwSCVic~W>%7a9G zkFr;6lj~idQp$xh1Rc|>Cuw)aFM6fN7fmXAQG6&oR=EBTTW1+oWfW~|X{15Al$1uK zq#NljNd@T=sZC2aNViCLcXyYR^rk_&Ti`A{=bpIF^@l%I zmu-7&ixajd4?g>A?L~W`sI!BGGUP^F8@b!8%SVgd+1T%@nS^hr6he6~I1L)z1QkJ- zboE(J6L1`!f)1Ekl0&WVtAgnsIb6%`%-{d&$?uUpopn9@xFRJEI}9AnD8O}ap922u zUs|Q+D301=h>SN*z*q5XBUwiQ8g0(QznUG>$J=8Dd_2}F8X7n#&q?2hg`IO-YyZs1 
zQ08jU&s{5yW2}QVya4WMdU$#U9Tr~|u7sw)1@&_4n|JqSeGwAS#tX=mjr7Me#2si2 zO?@)|g+_Bu$8^2b;i~-W7e{SM%If5trsW~Ig9S?ji+{KO5L%?qI|}YVL-7LfVc-a! zKgV;#SLh;Je4pZM%ea9`b*UG^OX3^UBM{)6)M_F6eJnV0UzDKjLn;o77Bj`}jzZ*0 z2Zl#4ALi@lIHZ3jQ>w-E`}PyUY%Hf`uHCXkcR`B9koOuh=708K{cl*0Riv1ky*-{t291GiY>Y*l15&f3o02hkmxb*AN(x3%N`zhx-}e=A)VH7P`Kc=8 zs=+Y$uzT?b@x4El%?YtQGlc1ijJ~-s(z8R_Id3vi+I(Gos3WiLh6~6Do(w|&^}U2E zNA{4Y{k@(cGtHL5;cRi$g9oo0H@LgyRooQhP-07Ko6#32?V0md2!RHFus6f|i{Ye6 z7M2BQTV(e5R7bM&VRwDQLHnB!;aBJ9sa)n0;)-LA;1_$kbpia&bzf^NlmIR)EcyOy ztN;#)(u_k+UHxWbK>p#OkJZ5QqYd;eE$z&bbFkO7q zMMX?3W{j1SgOtm$@ZfieK9H>wbC0f{ZHMwX`AJF$G&mp4{>rEVK`56;%g122$5>?k z{COV}6UZ=Mw4PzXy)M8JG)WQJC{&9;+n?AChwrDHW-I*J-YQry9A9r#+?(&Eyk=IvEmk9 zVdX;$Un+2dAZHAm=YL@YE+aA`G|7qXca{RMWZofToeXU8t zjmT}0ZERu!VBzHOA~R*ncvxTfSy(=wZFPa*Dj`<~YU+(OZ#ZnlA5z_4&&}$7m&m(& z2n=?o06%P08sEP-j#7Nkn%1Gz#HL(xE_Fc3r(+JOBcGikEkdqCBunx{&b2G z1Dum(m@5<1xlG@goR4to(d2+xcp>u@-)r^*c8Ixc_vR1ZfG<4LRAK1i%_1-MwyI(` zhMW1uKv1O=>GD#hEuU(PWb*yvTLf!niG)&{Gy4-LR(j2)bnSP=-Z0Bg2t7?GY9h@s zG0U~KwqAFao{UB0R!(4zWVb!)->-{^ zoJPG;czC|;?JXiA+iUA$-;%-zS1mq`TP!uwAX5%G{sR>`Ql_kvsk>@ZV?_zY>CUxC z>t1RJGW@)D+xu(y91GDqNM}%WQiWzP40ci8j+paIOvA|)BV2*~?W8FkW0a_}SpK(s z4*mm=03$bof6FI>jSx}0a3q-nAA5P=vvh`%luvW|IS%lnae)o0H~(l&pufNRX32Sg z1KuU|+@;jIY`9u+L98<{jA)GBe@&0)p#hIuV8yTUt*=PW4J>vF>nykPIVkNzSTxg+ zLax$td_W-8SCetdHs>Qs4$hkPWW6SZnQ`TX+L4RvlcJ7}Z6Wt}ZB}!mp!x$&f%cA$ zhnut5{Cv29f%LnJlse;~OuO~l<+k+M2K&7#)1&ht#EcvcSq_gY z^;2Yg4|5e%=OZcXZ|aku8u5;L&0aXux`+W;(HIsVheTnoc@(;kvFT}h=vWSF@T1;E zgknegWa93y7FXgf(n)qGp;^)B_$af5Fd+#be4-nNTg1p91d~b)JN;lQy4Yxx_2Jf` z4#C&4(@Fdh!ehd2Jt7H`l8uy~?jvguvyv0Q?UZ~#>1-O+bU5`mD`V^LUr|X-zIRik zDkR_5^h73P5nhJ+qrc^uo;QBkq^A<|t`< zWlkD?^LJ}{-K!C^kavfA#GxXrn&rV*Gh~mEsX#s)b&1A;BsA`t++r#gv)(V zL^EZ3k~?u63%uU~kb-VQhkeV^;@(Ct!CIr2_9MN?N9i8@cPbyDE%FmB+wyY6Eg`bl zC;obY4l0u>5=zwE0zwa$su{u`ZI*iL^(ON$4T7aBv&+@`@oEnzzTRjVq=t+vtjxB$ z=>!j}R-DJOk~;}H*$1H6?e{TnmZWc$m1fnROubc*z8~8Gz^bR4Q2UC-_orPRrIMN6 
zbjMeYRLpQc*?FQsH%nLY9{km`quDt;qWMA&jcO8`mBnKT>g5+1m&pW0QXW$*3d%UO#5>KGEnch2 zvu6USI2dBVcFar+{oD1AIgW+e4AjbFNY271EJ*)Zzi!PgbfT4FRpW-Kn;XebS*!#p zH8b{KX7AHrzGWlz_&n2`<)nT~zSjVWB$LZ{m21sJhihkJ;_9g%A;Lb%$>-a2zC(|r zrdU7{3$cgzMCn8eft$>n6I6SbUC zeSxi>MI2=B}LD3>U}Zm z1nRMx?01%%Y;Ab=Nghrd=ObkG-CLzpHJ6P0Ega>iYx)*LztV_}s84-hd~MCb%!J!5AjeW(b*2>vg zjTiyoQ9jo+oApA&*=w=0<0E{+C-GVC`3Ou{B>Wv-2Xe+|a5o_?%$7 zlBOqimVS1VeAyp;@hcuVS;)rqarEEtEMcvMg+k|9x0}^H-cia94P0joHe$e+V68q4 zJ%^$*g0wU~XTaq8lkU|lL8(l3!@v95!lIanz4?6@q5b|Wn?%B<5IFsf2$NIP>mkP$ z>%Rkf55jPc+6ddeKprault?d}X7TZ`58S?fz|<-FTP!;B3EGu=4p)}z)t@}|d*G_O z618Jya@o|8Y=YwO_vDYRCYHrF9y|3HH2(Vs1kJK2vrOvodrGX1)^RnUqT7~*MXH>N zt>ry%NjnzYkQV-=&z2_b<}NTkcF`Ws&X_{TP1{4QT($(<{eT4WA1wfZv0VPq&1l2V ziT(?R22=jA)7 zc3aP^3ZPe87#_=tt!%HgY}x(3eg6SDz24ULcT4NmNo$Jo{GC+@103sC@Q*OdagjV; zDx`g&kz4Kb5Ah8B?q%oT!2fk4C{y<2<%N2)a%|`3v4CW8)?4#QO@%i$mf3})u$ExX z$+P9^6Vj++@wK5VMDjOP!99%z9qhE%ly>|`!XT1B+-a$0%(3z20 z<>MU4)d6|~kxK&+f~3jq=X-e;#Q(1Jh`?PKgh~o!>NcY%!<=}EDQr*0p!oQx11qhFC7gfac*k}VAw_H_ zlr|Tkb!}eXn(hpJq=c!>RohY&@e&lKSptQz9v(}iGk~T+7bCDND~keOv*Zp8N3ry7 zwSr#xG@c%)5#WZHRGBO_Ivs$~cYfy3>C-oey+e9bl;QlKPw>0KnGeq~zOmMoZ!dnR z&qkY|wpu-J>&(@)_F$sZV}?J$yXXxX$=e2lZf{0f>l7BDKYt%?@;EOSl55N@=6`?h z!yEpc7{-=(U1_H86PXCidUE-c#2OA>=CZd(c&!<>)#hoFW^WC6#3N z&0K@T4bMsr*?X$Ha8im5O|Y*eLwpg*>r?t4G{KDfxe$RZxyV=lXpI`;a32+FSgjKQ zaPd^a*FCtw+=3Vt6qtiKfs(p0%>Bw&1XtJO$y{{Pw{aU9h?udkuoykv)c*bl65rQs zRbHuXO!yAy-*Tb8eb|}Y?g|`(Q6}P_1;$ksV&@VYcadp7r^(zj&)Z8|;GCXRg?pxX5HINPh!;6@FuTW^Aq>Bp5$*0YHD^Y}?s0Z#-7}G4RN2%S z(=@i+-J(9OR6{xoZO>*C`To)%$HaZOcHcBYwSrfLM4gxpx(T+nCw$xB(*zJm`mlwY z0sgNZ5?%6F6m?JED0h_HMkoo3a(Wuk%xvvm4ZEBFUTHxp>XN z*Wak6?{d85G6^LrH7f{UqlbuEN}bK?G}M7>OW=DVO*i zN5oh2+x7mfHWApS>V{AUYi$4wkDTm~pV z`4NzAQATAmXXoS?n_V|COL_|*18Eyq$`WuFfdPnSv(5q(O3uLpZg6YBuyAmoH~CVW z3>@c%JtR?sRg#}K3J_4afISjhVp=#Qsf+&P5}Fb}HI5cm6|sd;;qbqFVhD8pY@ z*VmsU);mTvil;6+dVol<`m*n2D)zi$WE*s(KqJCT=-@Q5>f^4K^3E0D>71g3R^Rb} z-*N}zKJB63-ByuzM92}n;pXE7GU3ype#6D>D2OJ{JC%HHr6z{$&t89FzPTO-NQ=vxvn&1PcQuARrA^N3hs$mC?l;LP 
z+(ubhG_6Z*ZIkFr{S%FMj+)JI&WMdnU&q;h(FQRfvcxP_JzbE~2QnFp7-!b6 zO})Sx$BtQRW<_|?(hiAb{Dj)m4wDM_b={9DY7++F#l~tWIKhRNj6XT9o`ykR{}^}N z`Y|EO@CeN2r_pAixaIe)F)wUu{)OSzCdR3G9;Xstj=`Hu!)++Su3_)J-d>rK96(J=AjtMD04YR@lh!AG>bO zwkLPalOMkAB7;{X1!CFe_1J2?Z~~%>51}o#c6JtbmoY%W4M3!~SIeeIn6(O05Ra2J z1?vM^Z*LJsjeQn+7x%54_U!!@3rW9DFIht`k35ScZ89~Guu zS7m>5N^=Emd4*t`K^`sfx%fN=#Dj@L*sCm?|0q4>{~86cGb@iJ%#Esu;G0MWN_954=} z!jlxAdAWf_hxW#rkB`sU`O0RgaR5KS-+yjlK}TCVIXQW2V`CrMlG0-chz7@yX5h2_ z{v8WoaOX>;TZZm(#E3a4P@-(4Ooc)Ky)HWlW8|EBd?$rvZno6yr7`$P=6f#HsC| z&t=-w8^M_LG9tq-NSJN^F@o_qdyrAH3OP&Kyx#l5^ksjUI5f`mUxUw|_?9kobb?;P zKkeY$zTPZ9-wMsp`eeQXKQ>a{M{){w(LFXWzzy}6`pPC+vZiI;BTR`5C#GFU$(gRV z*Aq&x!lIa?LIR#na&n{15-$=GlBK03J3G5lj4vJ_Dx;yH0Z^fKjRnH6%K?a7uc&Bm z1P?>VX@Az%xgIb~!b!OC2?)UZ0cN0j2Mcu|fPJ99A2=Y00zLs3I}oXJ5TjSKe1ihs zHnhnZbCDHnB?wIGkKGG%b3|IabmV4Q~AmtIm=&2c+9$uyjSD zN>TwBfYCgS!O8gc4i6u9r|E&E10!#?-z+h7sC&974OmdvjYFJORQ~)B$jhc09{Zq; z@wmqt^YdqTo}vLM-|rXKE^VIiL_~Ai+7?KdlBB%2Hygp?&DPcgHCE=;MtvyoFeNNZ ze0+VNBnl^qEY*E(1zmimD>XPceZ#c}*zrX%D2SRd8@HX*s-?(L)V0CgNi=o})c4iT zqF*WF#{FiB&J)fX;u8CO!Kt;ntZ96gDPZc&35JG#7@wu)Mo6?0=%RO2box++3&yOF0>O}jPzp}MSsBED z5FqIEmEF2Ysu8GFb9Sv9j+4hq5!~y6!%04c3*ms$TlfpecEt~!1~3QUu->1mLMr|R z5SZoSnws1I9#DN||Me^IGEq#L{QP~|6djp`kAqVQ5Huzu$r6&15kTUmr*{u*0-Kfr zfq}r20=7M%AA>l`Jr*Spr=Ld2^}yu`g;0-nOXjLe5nynz%o8E8iz`# z?MqJx(dg*Pu)g!nY1eB-8@)D9r(Zt-k6M2+H(VR^P{*?f(9zE2Xo#&Oo&!7E`Fe17%X)o1X={jyiPEi;#IL==Z@>;FtsqBI~(I z~N<| zuuyVbTzK*)o*Er4201?=zvuPwYN2`&a8|>sk?w=wY{X_)-~48navP_3rYP2gcs!mKKpul$4b10hCcv>gssfSCQ8ZkTUo49&Rqw)zCwE zo29HMm7Ggm)9yPB;-<$PXIwMH2RL)re@pl$nDe39`9S1tPIy$<1G%g@G((pC4PgMuk)^bJmeX5eJ zsp;U&7j6gXcA*s&Ftj_i;u zRBC`)xu;%}VzQC4J+%VoVHMr;1p<>&=%5NDV*fiEk<+yIfQ zsQ9xfh1<5(aSvZQ8ko{|c6JB@fRUe!=PNuNn;BYtef@9>_t^aWZCTqxBk&PE9-#EM z3qAgYvr6h({gwi7nTG(vEE!1-9Ju_}^VNVM37qw{2!fBGX81Vc;^G2MP5^zcsjhzV zF#-bb3mlwpyRv;_3~-xIjwx`b04Aca^>k>kG7`DLQz zi6`onqMqOb(4bgGvh2M5*`j)6EU!a!O={p|vP|POF@w=+TWvjZGw0I?%x zWyKUT056EfQJmpkRF|?~80~(=2*%Rp)nVZyCB?TugMbg(6?6gk 
zn|Q3m)rCVp6<4-7n}GT)Qrp{ssFFSeV-x2G9(?6XRtzn|+NoymaKPToe^lmGY; z%~P=SH#{%ifCpbaQOD|ti+7mSXCTv7;AnHRyyjhG%lhejG%S~D(hk?Kj=ZeP!FW#3 z4b6%c2kpiDIA2pd!q|E8l%rDL$eDB9fgzXm4diJ;g8lRpg(L8D;`zmXhROJL$*GW` zW*yCjYl|!na$g>*`YTz;cq%ohDOZp(h1JyKOh)fhpe}z(?9%w3Nj?Lj{naEFg4-e$_a&*Q@ z8eGqRr&Y0mTj~X=&9u>H0^ac2c~4noLAO4MvNd-u+3-z^^_*!fk9)Srzx$$Jm>O|B zLMIRR`i|mXE{z+Z9XtI$pJ78od!QhaMh&HMO1DoqhoOrd%+6i`0p?Y@F}U!c60oQ_ z(7$?I`|}5~(T^>Wu=dnMgfm0ih2p(8EPFkwv;B?zxH+toOHaF$aGMY6B5qlP&jP18 zc40@lH9O|A!jPpQZ&GHzSVu{{*%P?L?Y?d!bYbBB<+kwo$Vdv!YY$TXf~#7IJCw>$ zXFVMQ-FA}#hLRukH(h&bkxOSOb==r7)Bza{9}s#PBye$G1P7(r1TwkSrZkwqif{#4 zfrs%wr}_wUsfktlo`^mac1Ai569qEDlzc10j#NuL;5+W9pn?6<6{@9u}Qf8Y;XL_Ex%{5Gt z@yY>FUQcHPYVBZ;YQ#p-`gQ;E;2A2EyX18&op2bq%&tj9o13d1FXpi5of zD%7aBZ@7x$dBwg~v=9nH(A79|UUDMWuH}rF9M|!R1#AJN4KNvc?iMt`L$kmGz51Xh zd9g2ASco9`QnRm;HFTJPq+FG zm~FvfmM2Fv`k8^^(S#dU_VGU&RQ#x@3*oPNjE9Vsz~77$Vpa^LYr_Ei{FLwCWts3N zM@tQgG?)hsYdJ})Q@9SMN_4#5^)+dK4|Y}?pwCb`Uowcr8i*vr&3!9IN3Ddykd!DH zEnMD^!#@B=nfK#UlMsQ9=nE zi8;WxlFCu_hh423#r*6$kof|he&so_)P-2i+Rn?jWXu3Y1Uhd|9ViSSHT99|zWYRr zm)HID>33;Co9^W+=*b+VAU1m|4en9laZ&!(bqY7V=3LzdhBesxYy3a05LJI zaG12pw@i)#uF0h(kIYQ+zCHud@37KV==2}$%afJpeyYS}8%D}FCM!gf;!12%rmhbR z99diI>D0W%3GVmvn~5YRV|!!v*AeT&6*J62KMnpGiOET?c`u%o9yfR(U4N(Y*_-7l zL|k0F=MoD9WaYIx93OQt1vQVSd4*;RMa#IT-;fTC4jXXyVE!wAz$e?q0?!s1>@eA- zV92abkiL2UPG^`vAobToBX*IB8)%zk!0vQ^N!a&;|@3<(JhZ|Ppmc6B{GKTb3>oBcbPw=t4}nU}YX_7ZCJyEvGD zWl>5&!E#=0yFW@602^9cQGs_m+bJAyE`Zw>ppc@>&FumLx}eQGla)Eax|J0HAaHk= z`puhPJv|cRIp6+*hzu}?dv*fdWHarEt;jqI;f~nE$KN@(2W0Gja)ei4Cq*Y z)=m9pG|bCsm>~MyT@26OF}c<1-RW4*oekohf(gB%-@6)MJ?I?@c%!*51vE<`TmM^c z0hbSKTM8>(I(4Hv`*Q$FDhPS_yAgudbO`lue|*>khzf%i;uj=$iY6usj}K>giZ92@ z47Q`FmLgwXd7N&9r=+9@_DzusFaeWdmDL=FqT*}#RVhr0fgKM3*XOjn#3cM>JoZB! 
zo%JaPZ&&>HddtzMiO(y~@pZixC9GH<6*^=QcTPJvGcaE!jPSMyd{!y(9Xp#ZpN=ip_FHFKKR=?aVw}u#*c`8erzMlcE*f5~cxJmwAcaxMuspOlSuhhpm zlkZIhPQX{3lu!g@6$Y}mH+SEha^L#6KjOl2TdFXvA)_jz=!?BHaC{}%{S!OWuSioe z$g@E;T9#55+b09rY@Y5DxVuWy#a*KyM-Q{CdXuTs_oW(Q$-=IvIK=L#p%tyrvYwWv z%jlQN%@oMqii*^Cc4GGyjW`Aj;)9vnrVvGR_RqKmti3RZy~M^g4orlKcy@gfN?e5I zIa)q)9D(h@6CTx?@1Zw*30%9FndAHqVYBPo0#;HaJy0>>$oDvj<8%@2|2BGhttYC zV327WfBXfH!KwMw9y7LYwoORIK@k^1HKI+!K6%_0$R}!bv!^R#%PxCp%2*t%q=9Yy zrrJ3@G5Ni|&f&!utkek8=CRx3(-+|)wkMkG#`5urR#-;kcEmp(1np%4CH^(ivWkm! zwY6`Dra^J=pAr!Zg801U9%Fmqd!Bza%{uWrN-$DZp<1(!d^J5QmmKj*M9B5PUfQsK z=!U6LgKQTG2b+?NhN@4)ZG153qzo%C52aHD!Q!0?o#+H$;N3#Je&N5C2F; z)wG+BH`K^&CySPBX#IsURoF;Pp-RGi&|qlYD0uJh&Xe5EwpuR__>OOG@bNEUU}V7~ zuvv1=RxdK4jQZ2z`-)iJ}K%Dg`L@$tDyDT)quN%dyqYCqqEJnUNnO3Cy9YgY15yIol9RCzR~luztp zK3)4{PySSnbOm8gc04uWv_JJ%Zz;MPHsmgvEA|SlQ`MIgluBqh{l8yh(-_}wA-Ltg zJH&sJ2ZsH%sTtyJE^E-kxcBF1j#5!ls)`!3B3uGXS5DttY&|JUek0P8l@8o`C8ng*?2+ILdN)|Bus7Nx_3w0} zaZd-OlXCR}ZKkQ|e;E=gnzsb1q@@k5y-4@}WbV#MoILh=J6TP;ImSMO*M@Hh8;c`M z<+Y&PF;sXV3Z>g^wR|)H5AP^Jzn%3M&8B0s!_dSP$}oOU zh7jaz_UFSZXqkT6#IIj#UqZse0+_Ww12TT|P(oI&+=+r58hB%-Mp7!q$D`uoYr$ol z-@&YFAcmkX@~YXx%^+Ry|J%~3vzh}bGU_WN@3FKY*AfhPU`PPSs4nfhbvAk&*2!tr zQs_wF(MU^lr{s!zJscY^j!@I!<&byr7VQ)o2oO&%)!G|TwGdR;wz`k*@a7wxZ@%-z zmybSd!$JvqSZ_4nvxFH@kR5IIT8;YtdY`{NQ=Xt|meCHLf@P2q=ZpV=Tdkf^84I#?0rnvd{VK zo3Tk5ndm0|us_3iM3O7f#e$~jZe4JA((ZOSU!Ue&#g{c^<6BwpS&0b$BqtIA z?;dEMr@+8`YMK= z;M+FhFz?EpL*oJu^xg}Fy2Ql9S8bKwHSTx{hiD)%f34A*87Ze+5+y9Ajq+yymjOYAHBI>IZ2IV@ou3C%p%6uF=Ngx_$Hp)g5?n?W7HNeCsVrr1)yh>V za5R&Y=LDBMUZ71?RT}ttLPPaC`2z^;YO-~LlDSdY&0JjYBq0p-TK$@W%c~&#Q`cBq zIJCm)A^BlUd+)o$QG;swCLc>_cmr|5^~!W92}!BH1j3!)QB&2;_p6HP*kQ7S->2`~ z%Lgb^I4zP3LG4aI$cQ)-TFGm9UrpvbD$#V1I0Ga+^kafnKk8=g@-1~_*)x7V^c@8L zw<-i6HMJx6f9T+y_yW!Kl)?$ElpRbUW_PGcLhr5ge8esqZ-lx-EkEv2`jeX>3DOn~ zM$tw}6r1LCLwl-oICgWQYK`B&KSWA9OUMuy8V}{zo6~rGp1WN>RKn+0IG>E3(yR^- zz4iy{c82$>m62{N9u*S@? 
zC?y%=XwF_qON&zy{YLMp4(-4{MIfj0$ahGNDFd@7Nx}$abI6}dcH0+%W?U+YjaQvV z(RGE5r=1-|NDBL$Mw{J?8FekMZI``lhc3$b>J)@@_l26|U#pfkR5n%~$vt)ga{{V< z4}w)N1oI4*q?Bi^*XRsuAkpbI?)!o%raly>LR%v3;$eClZ=H)}d3&)`Cxw8GRVOiJ zjib4ANt(31E{T>7RQj2VVDLkohNa~dbl7ZSlH&`NN#J6ASmp)oCl{Aoh;H1D ztYFth)J*n2cwV3X2(_Q#2}?B+!^vjytvgC>uX}#9@FA%s$YZiHYv2?PHC+(0!!|gx zT_3pl43(k{+KcV?zt*)YSli3+A$73k$UvOLGQ`IsJN!E9b?ts35-|4lyEfY{Ls@Y= zmKBw7b5u@F^Qh?`pvlpwsni{V{kC3(xL@8yQ`7K#Cz;>?L@0FMLv*mPwPR{?8i7in=1qN5WEy#kdax%cA7+b=f zGT}ARjd6SD4xxs>00;6(t`viC5<%#Ig%jx5p=>yJ>m2f59~Cj`(6zxwp~QY3h*V4) z^tb_<*>r|;$WiOxV~frO&KHOdZ^s{R`W_iia*_Q-1q~qkvzws$6M$f4=jCN&cEYmi zHsRl2T3f95&UXb~Wsg_$3vi1|2t+TvDkLZK=H&Ful=gf!F}e6NKEBv| z0BF-~?gP^^GcBG1lB%jBziW|vJ7Q5kMgVZ7<01d_VBtZ&atlbFoBbJ(EOt_=5BnDm zzljA(Qlm^=d}|HVMJV>6iA=O<(nZVn6J8i-eP1W+Tgd7_t_6@x)UzEJuz8K1pY#L8 zM8_K^kqnu!6Hr_)zdfG_yard|*}I)0(wAB<2rxkTaypjcGZH+<5sSn1)0_ld@(Fv; zqZ78`5Q`P<&O(6`MTd~V#>Rk7B|iPT9W;8aHzoOpu6)(kR5Ua#u!hZfJAoeB|7C*$#PwG`JTO5hSMH={M`gZc<$m6L-EQm!uNusj<+V+8&&-$n=h18gW#a_VphyJ* z@hd7?-kZr&SLbYxMkl$nT5xihEho3LD=RKpKtQ@<)!DM&R?u}gPeUcZh$L@02TqG{ z5=)+U#R}5WRpn+xL@wK_T`L}U3f6O9KmYl4sK$U150B&q=8ELi?K$r|x(z!=ZOoOG zPXI3mTmgYUFxG& zUppHTRfzi5+fJO6Z^2=L0`Ut{%x+;}pvU^dVZ1}c82a4LhZ2D(b!tXp9&zyFK+|g% zfY^FC3Z?DQmjA$u$Yke#_I4%}(k3-Bp;Q{oaOeCpGuGMvZ##ht6DfjoGR=Wwa-ovH z(4l+QD_lgp1$!Ek`+*cD-jdj{nUC}rHkSK>nVPRBkIp?cLE7kfdASrf_hSO9BFJbk zHuhlSA}7bKHdzzzyE;mDIN$}N*Ck-k=5c1RSeHz|V}xyK*`Ax9Pko2Ix>6q9)nK5e zHeY*i{YwdOVD~qNoZr7kA|WEeB;#FJY~IY3AHwoumR&nouUZ~s<)B(WtuRWszwi1X zz{i}h*+lVVp1P^)yE`<=EEF3lLJN@zh>GggbR<`Zo7$rM#JQxshcQ8Z#9ubB@UKxA zSdiVl{x!*A-g#0><@4dl_Iq*hFk()baesdfho6uHRkcBvn5da~rT6|PX&BaR*`!y2 zNouJ$e2p$h)}Rx+P}iR;FQreOnHLCgp|};=cVx)h0@{am{lx(^o%F;6+F*Zi(^X0s zX=Lezo07nHHI%`-=K0{}Lktpl0d=!viWX4xfy5)mKm`Fv=jqF`>_?AWFWA^3W4TBk z`=Bn*3@!{(8-7Jb3p{gvyHz}2GtsK6D}-Yb1O^^ET{B=h#ShveLau@a9bD*0Bf8u949 zCeSFgOCL0yHgInCV6 zdGLyo{5^#`*VG@<^6eA|&#GS5_kK!T%80gFp3YS}BD9CdE)%i183nSO>Bq;}g3T4? 
zt2K$rq$E{`b7j0ZZ!4#g#>VivI@zzM3+N;m&sODK1($+M#SIaj6ZW# zXM~WtCSDc@w^s!B?T*d_B>#J7ZU7U(Uiru3r(m$b@34YsE|RyJiYOLO~P zp)JAGhJsvuIU3$rYN$o`<< zdWEU<{G8M2@pPo4^STcv_6a3=Z@tFRWM@ke%s|j6*c7Y@CsdAUvFvCXYFf|8)!j3to$+=ibeh*x zm-3j}DZ4+VU#=*l9Jg=eY7ov}B(3tqD)-So$39H=AgE+^$+zRyVGaLSMlfYT8yWt& zjlp!BC&V|q`?KFWtB80P#WeE}QWN_o0CUE0y895@(eA9%0sFdeW5fN-gP0zy<78aJ>@h&Dhvrv#* z05Cx37X;#xx02%B`fU$yC=JU>4^ufUO;~km|EzQv0XZSBeeY2F$NWq~Zg}j=D*!3m zn9k0wg7DjP!fQHSJS@~UWUP2Rdh&m9lb0t;85td71OW%0=X6+bM^2jWk&a9toZHUg zXOMT~ott0l;bF-s?RogGB|EeFyn>NoW!6rF$YEzT-K}V`D1shcJGiRtAh7x?Td=EZ zSlG&g=ef%GSoreqH0qr}LY%B-0m&C49BTrMs_}9-QBN(jVFL_jVTZa4O}p zByKr$8=VgFn~)i9AkGz%a6iTfp0}~BwV6-6WRc}VO}2?)ex3e#3+Fkc%e#eucu(#( zasSJ0nqM<3BkOk8wR981P-Qdx38;yDVw?NHS_2(K$o<~){zghxPyKFOEJs2D{)GVv z+z}D`cM@(aV`FLxm=CpRCud^Gony=JP->J1>F&Ns%NPB)f15q#iF@_t-*5zbSKth zy-1kovr7wS_2_&?BXUQNXl#SGv*lF>wBaupe;BvXy)>Sg`K`Ln4oP<-sO0?M5!RXc zj7-W`s#7yz;@Of=u4V&>&WPBNDf)EM{==C2X5%q0-rHG)?XMBryU3E;JtuPt2tlmo zKuFzwDh9Zr)IZL5)oj}YqeNNmgpI1;(C*&rzTQROAKzT)AAvt&Hx(sdo_{!5m;W%Q zfHl&!F0Ay688x&b@@3opgv8-Qju@k+O`{{DAoD>|TSEi8zvx@G z@tgC4>pP-thOJO-e+hDMek`a^I_2_SVQKQdTM`>tXum>Z&S={^!03-#i7qvEeW* zKckhUAGb{%DNJ$-ZqZQ?)(>jB5Wj}n?9}UnGd8LbPCQ(`Jb|IV=NVdk4rcD3Rjxcy z2JFh_$hYFvI`h7F7_Qnu4jz{DUhk3=TvEjZ8F*h6T_L219EL_OM=;}@-=kn*c)q}xdouWqRlvakVD_pHxigh)V6U3v9ATM81izPVXZ@kh3NayBc3?uJ$8^XJ7P zO

tjpC9%WjaHkzW@l*+}tdpu;li3%j9IZKy(sqU0p2flUJ|v+JAjX3=Z~ma#C=; zMPg^yBUJ#Q0gL(h5FMQtY;5DKH*3E(`=HGOe~(vVDk=`!#Fvl}q1HuU8-Tpy;(utr zo#Yygnf%ehe518yXOc6s?;(!Uai2Xj_&f#CoRN8v-!cQrU`Jn3F?o^YW4Q8Nabv4e zakGflmMgh5a^)nlOz5Ji#&~AUmvvn~@<%QsH7&nQ<3oJI-5gWI`sIg46K#GB&2lCp z)9_6${~updMvJ@KN$?P%`EQPU+es$V%N(~Uo%LOWsn_Nn-#B{V!?4oR%HglPP<(@5 z9MO@@u4SZl7f>%JWYBKsnP`9sm5Ad6LE3ipx+`Z}+47W80H4;juwZ&hZLj*mh&S;& z9S#|QsJvAwJDU)rs<&_=ZxGdpvMK^NDgsbUD8?4Uq40KHY&Ze0nW(Bb>$Ar)+zD|=%fLc z9qg3|D zJs3bpz1m+}DJ3y^#6hdxTh_iRYa?uq6c-4W9-24f(a3qf-D|{!*^J#WxbSQAEs5~k z9aIMPw!fF&1Xk+^3#1)Foe4Be*dn)oSX9eHOOHm(%Tu*lc4Kby!0kt;(C)Z*^~SeX zQ~q4#_n)SnsX{#k7{#MzhjJJ!<1qlw1knRFB1(h&Dk-*fgRbasCBa~VbVHp zP0|#zXJT}uSzqQLEG~I+zq{=Iz0$?T7h#JsI>fN(yc8;8Dm5VUE(IbrC<95;`Jk@h z%jFfF)~z?f-;*ugGFsaCzt<4^Rg5TkUVC173+lwE&rE!eiO|?+nNj&Mg!C6x+(7IL z0#4FJw~F9VWMheRO=HG26i<_gtEQ`$VFzT~OnPsYlONY!x3Tg&3XMa*&^7X6x)0k4 z5n7BIQjl;{HaSti!1Pb#1m)0xWl-tFTxs-vo#?^lSc90Q>2JMf3hT07nPhv)hzZb< zA|XARe%{^Lv1?d#-9`MMt^I_fPKj8~?YLX-i?rVw4vvT@#q3x=(LFRpH2^`!V`H|z zi$8c-MLL7|=qFX73HKbZ>Odk|(6A zZU;t0HB5q#qua*5yN(8Df(6{>Re{n{DzbM)i3Y-w$Y0vNe5O>^(n`d@XRcDl%3?o| za~Nf*v+>mTVup0x>GpjNdCu9gOL+Pc(iWh3>V~#R+j(!~F;fy#oS7L=dZ>lgydWO# z%k3S+%AD`OwlJHM*PN+Rw>a!>IYaNdrTuJ!oZYEUl7Gp`uH+$mbyFg)bnC}fRj-19 zAe&HLzti{i%kMQaa7SCs5APhNe%7(thgqR*H_esS&g<%ZsC(?CePyjIt^Co7{)1ha zeNoYn<*)tj9C5u7Zt4D@Pb+#aBfPm8(3;tfd;JQG%&Ysx{hVLkbm^X42Y0E(N6e5F zOrQ#E+*d*zbty#KY2^2W&bJd@yl`)B%LqF1b(TOdvV@x_Oiypmcz!zN!=2itd7FiFh@Ex(>kLn56&d>|^n0$EON zZigVDptGwBQ0m)wCW?8WAcpJ(t(;lVNxl?ybXgfNz!~Z2t(RdX(tWvM=H1LoEqdV+Ht#T5bl@sOg(2?9@S>xAe}CyX=MhF8F3U#sJgl}m(x$Ly*LA{= zcr*9!*&-a_56sMr_4G8uZPhNNL>A;k$HnbV_Wd8O-a0JG?cE=yySt z5Ky{H5Rj0Tkxpq)k&;en5Tu9h4k^hYBn5_U_$|NZocEmT{d4c_=E9lvtmnQzX=<^` zEGk;uI((A-pb!7}@<~jmslW$n)ySXrwPp|%T!k$Oq*3fsUeQ;)yLhO4*kS6+BCjS;D_we_SocB|j03inlGbh&Mlmm13>NarG zRoq!uzO?ck$#}4hL61~TG{Sl3g^Am_VM#sroVYDLW5dkM>;MLH-JL3|>H-A_@lPaJ zIXJFc6Tj`xKyz|Brl$p2SPHFRW0?}mOG^uzdjXlCPzS`Be*EbGBR(-a>5@Uc1-)Sx?%&JfaXYEmPVR7iD3mD@`X}-k$FK&p3g^5Ny^)H+=EFR)2l07Ubsx=aZi*%50!7 
z*k||-ZJS@EEk0M!Os~x_1f^#`ez*aBuYY85vAwBj2BlRh$l<^c1Uzv{vl0i*2XYTB# ztvb^C|2%8FJEv%r43iJKo&Hw;S6SW{nxBvV!!YAWAA>f5?ejzW7eqJ1@+#Hmt}(|U zpZ6-t^;Cit8E0g;OnFj9ZM{F;{u2I@Oa|o0X;Fg(IXR~?*~TWjyl5^2N|ARw2MT+M zI1CImeZ`&dkUww8ai{iPk};C|bC0S%iYf+9V4Z3p*WGem9I-Ce{UT@$CedP$L^L?d z6=acat^8IhEMM6EJX?gfhQc#>Xd=LgjM`Y+7*}d-1v}h4GwI`*QY|aNG39uz3bVXS z-O5gg&0~B3EwqVdEO=*Q?h9jYxMCE9F|oU}a883x`b}qQ(8aKiY&ooyLIVe>F5T6a^JoU^S?UM~Vg?BNP-YT68k|ICym2 zc7z8OEsM>Y#Uj>6!#c*CYIsRS?i*C-qxCKs@bol#%Z zf8RmMUc3J|#QHoy^2Z5fVjtm=BH0nMsh)gUnI-2Lv4x*yn5j1*St2?I0_0XglGD#% z!_%o?j~#V^7yamwR+p!PqYYJk!K&ldO0J@_E<=+{DW;d*|1S5mg1g7qo^ z{fMUJg0Uv22EP2ACeFNcy^AQ>I58c6W%q+itoO^B*}9YZ<(p*;b>aQlvSPD2zpQQM z7|f_Gbek_xqsBS~O8@rK2Q~OsfSVq-AysYDmI^5F_49)^HeOBSDzGp!FWfRNXoG&Q zvBAO5pviZ3_T84a)ad5`h`u}r?t)k1VcuNd<-4P@p6O(8zS!89Zf@>eeEcN&UFqqd zYxfd9ByuV7nlCocA7tCXh>Pf3Ym~9{b*OxkpDc-qS?+HMW1*hHFL0=Y z4R7OBB!`G}WrxZ_Qgn$voM~K~YNI775VDl+3{j67?B%nL+Md7_?|#AY6XaL7i}b6F zOrLSM(|M&G$LRT#s(GYZ)-gg5A#@%A-x5s^{Y+mfS!BumsrOX9A_JHQ=7ADNQs49P zTeYDW0^>6Y(+I?))Q|-&E2BL59&sv(LF2c@rqrR{`R1dlpzQW3_iPZUma*|lg^Asp zKUw_=zMhBNkG+LrjPN1W^}m0;9v^BAs`3a5kmLp|{J!LmjT6e*i7HpWAaKfWh(2Z^ z575TK+K~3=RJiPq`xO-}&d&a_zfswlVWjMX$QJ_pN1MJ?Ch@|5{kD>8&X9G^g+<^vrbClW2D1J_hw@4zWS$ zg{_HuJNgO)DHmE(`fG>s2{sfITLBjd7B=?39=E#Ry`V6uuDUuX4wzT>_v())+TN3- z4A<4xf{P?MDM>?969n%&niR$cUs{yU277x7u5W!TE@h0sJDxj!etygpD0K9dNRQH{ZeUrp9`uBXqS7< zN3lJ=dIA?EBKMZp)O@b}rTydrl!AkB_naJh5vvf|w57pO=fzNT%$M0KmOr8^EfSsx zIi&vlNt0y!gp}06(6AJcq@|@_GVdRLeV=-_ya^m#oKK%xy{lwnW{$;srF^y0``p)u zCqvj7SWdnsCx87y^q5jiE!XM0iIESJ!Gtm;@)sS5%@KAku8s#bot{*-il4UBnB3BL zddDcTx{1*UQ|BM*W9@zFUnAR2@kd>ue@stEp^K@e7CI~Us7=nFck?>>_({XS;G_Bk z1%8V`;yWJcCTN473#C>`576Q_CriI?;gM{!*_U{cpIxM8P#Bg$u1>b`dHV@#x^s== z)Ci&Qi&&bz!wh;3X3BNHqB7l-o3CYWgkWND1 zb*$p9V1)PynAAS)KNMNnctSzZ2psL;y)j>3Uk{pe=?O4E->|%Y1!zgUzPVXgTnseX z1lSM4Lr)Yj5Su_GX^GNg0;Rj?hQ4h8TgUB%v$eG~{b`oeE)5|Af!j`j z8D3)lhH$ajgBDG&%pY?6Poq|+kc_I9v?!YgdM@G6ZW$1#IRL7SD^!)zzeJ1Sj3w@> zbxXwjWM={cu0x;rimR^@k2#Kq*K*2dLQVz-RJq0aCl?pe5&?IS;H2&B;YoWrJTBCk 
zSidIP*s!j&kdgA*+gnbTZOjSMWdJEXLBwd{ODKXt{u& zy?yC>yNe;OkWEkc`LnSxa&K=qCd$-oMU1W?-qd5H&`59t zuZ%|80Xd`M9t1W@=g>BEPPxrWZCtXtUugj&4t|W$>y}0OEZg}R6C*eRZio>xXm!7( zU6&Hc{P4qV?G)f3?^eg()`kSCx_W9F#^}_v z%EKunzPLTp9ozrw-VS|arN#a)6n!`0dCNYXtE8}&^<85oaU6OAM=7WO>8?icaJckI z^lH(#jX=fy0bT0{V}hXIOWu@_kS+r}NLv{&(*gf#8|f}NutB1t=Ke)z2Jyy!_Vye) z7C>oRAn7Rp5CCGDl#~PrmD0Yd9 zlY+qMZA6?G1cB=n8ynkF^UNOfuc@o!+$DYbL?P^CsL=612K_ZP1?vKa>3<{lCPM)f zDhb|+yu2>~;5GCMe%8~YDEb4mZ+dKxfr~TXY=D&&OBC8|&+@*%u9> z^5qjN%F04p&`7WJvh{p?Zh%f1wDti;0sO+iQZ`ZOLyhPX`M5bq+}HtXdMT)=>VFE* zARo0GR=lsAHLgR3JZcN=njV0nI?xOcDyu`uh`v3+2=)#HsPBZ|6KTKsF%ag;CN$Ty z@UL+B%BaongZ}^!U4SBBgsX~zg6S2n)8QF4`17l)UWLb_miBHz$!Ndt>!_l`j_16h z;1D~jDw&VR7^xo;4?h4F@%bo%3jvF$D0sH+t1K+OBp>Q(pQviNtq&&B3JcfP)LQK9 zi~v)xCFCl!%zzGL2>Bf>1Zkv`X{K7fzjjZJ8%5(h*V52f?g*aU9(NfV=QV0L?AYjM z14Xq!>BDCec`}yiNLWDTAIbgH;Bs%qYv`*M1A{kxT~Chy*iAza_Z}l{bkP!SnI*~} zBPY4hT3fNZ$!QbH@@v&f_h1hQVcwGPcSB87oHVrkF4JuzBjpLP?>4SNJ5g{M4j=Uw zHnYP8F*^Mp5@b8Rvp_4JR#942_U8CMAfmqPF(5No7WF^!_u(;$l$RpL8eOHCTIm3AeN6@lZB`LlrHvSrK<)y2Z?!^}+C9IIx= zMD$s(`{ed$Z|@&q{eJT%d;3|i5ugTDzjVwHWomkRcp&TA1{#%;L=wspVvyB$`k}}J z`3K1Tz)?gf^90SVpyc&dD<0thArV&r92)^gveoSDhK9@JrzSynlded?2nhJ9xs@OH zI{)P)e6p64L;QLNRN1n(Ai(qEk)^9Ee5&xpq$Z~>sAT@GnMp);b_fCjK#ZH2u`yNb z?v2u$HKhCdd-7ZX(?ITGzv6#&b~kBh^uT^vO#Cw%p&rl{0YLG0gW>M(ZmHn=;X`f< zczD8JI6|7geKS^3!63Nu?xeQ#1)Zg!*S^H%zNe>$KE9&D`P796ycDYU{HyAwFsad2 zWJMSqH2fbP z7-(s+gY)$CZfT{V0`xG5rKRF)#=y+I!9+83{KE$4MJhxHQ^*g(ypY}XYOS)o*MI)h z+25Q+SyI(`pFau@;%E!_a}_d7DU(_~5=yN^2DKs*iJH;|d0#EG<3iV7$dWPYC(V3e zS0r?^g4qZsY7pkS{#{+N<~@h$|G@jnmPJ9=0(gehqv%(5Y`g3k#V4&6_ zOT^vnK#w2Yk~PDUvjEsXF{CqxzY-wN40CjJST657$T;sl9Q_D0dr_4MbJjCAICj6amc)~J z-1*WGA@||IB@62BK+Lzj^kqZ8`@7Qj-q>@R1F7+GW4nd^j6Hi4 zoy#wA+rKW6;3YVg+DcmYcs52*Kpr(aecd3L8+lFo80WrmlEJTZv^cCR{Xv=vXb`Vs zyayv`0*x5a@<;S>09f(x;AJjh42T0K{QUgSx5QYV9B0d)iHM*}gS%6-E1Q=JRJYm( zIV&hAsG!RSMbFI4APxjhJn1ojKtv@s!7zb_l_~C33LN+0;izyn6r^SalpqI;aZs8{ zA7O^D06NhbaXY9_^nkERSQu>r^ok&m7(RcFHW0Wok^71Q`0wrPw$IKy5teIe_}NcE 
z=P}GP=m3=e=DrCUMo9(Tvp#R$usF$@BW%V^Yu}F(1FyHMm+S+u87rXt5<+a>AEYEK zKgR9_2|4cC+C+c*`uq2Hc9@x&?HwJvc`Q%Au)O%!xxI@}228^QO8YLI)WrZ%r(OVE ziX!soz+5cv{Aq9#9iLV(FhBx40_9U&J5jx!NLh9Z3Z~B^=c7sZ;18-o=uneNvY^X$ zZU9Yy@ySn!7#bSFs9&krX;RUlSB}ym7+laN)tP69flUefQRYuSgjt&i_vB1dGUr?de0k~cevw$b$d6}X z(_Qh)KAs1Z>7z3fbb>DVajT_?LV9&LoW#-|8|AncVswbF8qB39-^HP>QhwySen@Im zIJCQa=dxQe9VbhFRVjIVb{L~@*}(KD``vp3_9XM<-R_Ubr71NI!svqH$W^H7^S%3k zuhU!AnOU9G-Ai3XI2+!h1+)Lt?@jjPwH^}El z&}IW&y%U=kg7Fd5b)%li4*C^qoW~U5 z3guIB;s&)%N5;RVa3Uf#b@ebCEqHQD3OWwj`oAm+kW{9tOY-TdB3d!pPkk*d#C4^X zMuuaMN2H()f(rEcz9Sy zNC+1P2jK)AD?<*XVo`TlAwvQ>f)JIIls=)--+-|nG%SPKdg$^iB1Kpk2RFA()#er$v@72{Ky>Gy(g75DavI=cdXZ~_< zAW~ESjfoNqsmua-@Kz3c%*=|1Q%Yo0g2J=`H)q1Zt99{i2@CT4_esj1Z}%3S8b7~Z z;fy?Bhv*D2cV#>?mU>gT(84{cmk79520BlPB#J~I*kOITU0!#B;?3%BDmO;X^{az7 zEnA0P;!sE#u(u*Bht{1)+03Dcnu}3=7(80o-46I&a^(F~8s_nh2ufOD(Y+nPLF^%tEGb3#cGDf!k!UH28 zdyuYR=Igz7ckm9p!H$-uuxg(MyT~LN?v^%dP)ijn+ud!i-LF1op!Y_g^r1M{&*%Kg zK06}$@T?EhlHL_jaQ?$;x0VRm+VFhNzjf$0Ur z?+9zd0CfTe?YzW;-ku(W^Y0iadMFGaFCbm;Em4$W$VqB?dj9u`T~BtjU|HFFS_RZL z$T8#7r?_82LcrpIz++@)CiUzYVe}0tL;-XPFqo2jf`VL>#KACi&_;zrnpaeW=8aW& z$r>JfNmtf7$WpeB1y}T2{~174VSG!5e3VUswsOG!Ki54z^q5YDtf&9GdfIChR-K8N z_IE3>8k%FeuRehsm7-V7+S=NuC(+>js_=a-L-K>3iSMVPt%>#ZyXtE6-@iZPD?6rr z+p~M)ukv!7O2#}vmLP=JQ&;ycmU5%FM-la!h(2a#Cv0Q{pNa~(NS#t?6aRd(e#cn6 zQT+UG?%QG)KVkOb))yxt#eg9vhS!mLX>JHpEK(VB3wWp3=&9+y_Nb<&S>W^c-Ahzo zB-N?NWaGEzo;%JW-l8&+czKg*WGOJLK{S(R6? zaCDKpP%6%##b6-tQq|~qbX(sb`Q*E2^oyvj(Tr#JFZ(cX%dW9Ovb7X z1ynxEH2t*xsZgQI1oPiBUzdC&5YR}XJ{&ds+@gxw8$Kcqg+&?s)X%m$@ZYt-Mn3v@ z;|zO3PL6IGhnCmcU#+F1lbD`fR#=EiaCOiGfKVjGztbheSSzDPvG-)qRnU#)cLfK? 
z!-vzYOi?$axC@XbX4QQ)zTM`3Yieeu1xMpSHIIvqmT!yoQbv)ULGYhk*>LgN9(%&X z#6$uG5=PuXHPz!lYANqS8u13GD*QQ0)(l971c+-O^RHH4Z+d49jZq-1Mqtok{|fGI zfA1^|(`AswG`WswhFl3U3BM*Km6VpsTeYDcfn(eKM=*oHoEK1i;H8lF$3^=;_hyP| z&50XBRHUysDe}<;&`AB?gY!=cq1d^%WKZK6{BHVEN6PsiP*?L)jxg)mo=3iY@~T1A~M=noJbn9wOSLp zIY&4a4j}N+81Aw{;w_(f?XnUc+pMh%>xv9k+wcCVWq+EvM($^S-ySy*MZ{|#$P^?* zRahMbONkgBm2*~|H?0|&nR9V+G8{pADGk%-aKS`O1M6+Q_KM_wrqia*muN?YJc$Li(T$fA#D8 zSEu?x|1?_ww2H;ZC`+Ig!K-nc-!QYN2cj8nFvX!B@t8H6w)%Jh!ME&iP@8P6l}m4KyO1I0!rr#kgw$Rb1_0gS-Tl4YNGjs4EKR9w&?V4dJF{|H!tVRAfNT^DjR(bn zh)^kFX=hDtd5R@b>OX{L;MLlM05zy^x2iow=~;Znxyh^dz0E(V?kCPO_&Z`DycM`y z7qKht`;}?Vy)BrKB&~=DQ+tGVgU0(6HH90TX{{CEaqsb&IAo!uWJ>1=VXAfL_}Zu2 zU5^&|P;ER(jD&U=ahrf=XrB9JY>)R)_l*y)l&+Ef2KBCmMebN%Xf5mmcxx9{_jcD&XWzZ*3fbOxRjiaC@7pseVk zIc_~*dnXFK9Vxt0Jj9)){hI9i87i^u8dX#{ZAo0>W~IQSea2+2wZ%>Xz5 z;=%%&Kc$@~NYeseS#fFU&CLx!&-ZtC>oES|L7twT2vDuO=O{9Kh%?9&NJU65@*R_M z?>jyPfYqVHqt!zwN5?WCM;}4UGfV8`jeH|Ou>fz2=*EB|oROLN=FJ;GqliJ^uxx<& zd;rs4j(yG>!JdHu0DSwpuDwbHazJ2p0CE~Y=Yb@wJ5{SU#cb~f+`vFJG&Gc!mIivp zIy>*i$tGIb+N39hOyxlUQIw)_z0+@yBnvAA31BRooLH)I=3wjsv!e3-e6XaXWKM>{ zN1Wq}3o^pD$&@0Z%mNjWSQTaFFOMU1Zt8bKqjvv^oLay=_ig2V;>Ti24({t7O=36QF6^`Tx=#=jtg+LobUS0-#hi4 zdQ0$Bct1NYn4MJ}9)9Im{NAyMN`0zub}HT%m2Jgem5kAmszl=8+y3_Z(!A=fjkD-x z2Tr3%Pnomhwm-^GQ@)Lx3dIetdA~^v8yNgi(hm(ORVzgheXcoCV4+c)n-tLcTjf$T^!D|nW<@cVPR5wI`$6UBz?^OT$O>Y zE&|c_q9Sefui1!40O05p>iXE!BsL*5jc9mr?wc6mWC(}xrKJw~ z`m?~XMG$P_1&|usm8B6Zv^ZokY(~w_%cPv2*WFPiGk~>R22yW{J#5IZcX_-zRoPVe zZ&(~$nztaPe*qP+{W4Ciszhl%3+L$UY_J>qDdBzS`_Lb~A zs-!X78YLFRd2YO5j03!^(M?}i1X+|~E?l%j=dQPNl9!kyNRgFjuwbf-UsOp&C34UP zDUopMco#38tl` z4O&8Il|72Ie8%^){mG{lfOZZdAvEb- zLnkSsCL0@Degr*%W(4N}fXZ!cSp_`{3kv}#-to8-Wv^Z>{{CnsGbJS=i-n5|n@!&N z=W=&=$@0IeV4pnrvd~nY4{b$*qvGr^=`ZfO;%_{(m5jZp)@8jn8k5Vc9k*;iC32hM z@EEX~#APO$fgh5)wR+LChVl|$nNN!Pk_G8T>Ybr)!TS2h-5n7w?me$gqY39y3grg3 z98UqS(VfJlO8rYZQs)D$`B$gHS{?v3cvO~Vn1KNwDbU z2C-~v<)x*0uh_x325>#|3A}aWV~}gAkkjmgZHk6~0NAH2wL%9+NAKM!R9qAiWwvDG 
z_>0|N=u_nh_(nbb{V`EdVB^Rg_5*vb>~)%YQAh~V-eX4Z>%xKp5>nF6_YLos5t>vB z#^Wy;coB@jpAC91NI*|1-)PDcHZU+CXk^KbKrd($jn{s(+Wu%ji1390DYP@lGzV`| zJ}-pJ2ZZsGkXDj7aImqV4`JTn8*QMDe=sux*-=oasku3=*%>v^NU;^4f|Agm!T?1- zCr%(Ll2P%4$8*ttf@8K5K%Wx^&pM8r068Q3nbAxg9h?HsAle^8v-BH=ggNEKZ^6)k6-f`xu zhtPTqdDZsVGDw+3q631VYBgculc0LkI zezzew|0h81@*_6K+Ky*)d*DD4*X7>c4wG`}(b6t63(GNYlVNv8SzX~3taVB#%Rg@P zwTB66GXk1r6YYDMGO2fwm4ke&8>>dzZBLaLiDlY`8)#vX!6N^@W=g1?M*fUHI#g9r z{_UH^pv&$Ed_Wy96pRF=>_GRcn!j`;!(2smiqz!{=5O**{6NO!sS9?WZEY|_tF%9? zq~hnc(%`p7B)(%J?j%8zKc9*zVN%#~V|9e*$KUpj13n1*M^Th0oI_4iYviMAb@A5S z!s=g;cBFq`NSKL=<+47Z@%hZ^>HK?hB~VV1Cm|wq9Px#$_!Cvhu!#OOw-6a1IBz%B z2o`C}%j19cOp0@Furo9T30bP;ebU^tZtg4QowL2WJ4P|Fx0sk>2)i?74gEdLlD_Ye zEyK01dmDaxCDC``@LE9-0txNuijj|I8X>0i^-3dP05 zz(@iR%6}L?^?TGuhUnN>Nsq1AYWN&Rj}8Btzo)CK{SngZn=+DQkj&^M78SHuRW@Nz zD)j3p9dYF?#ur{g^zCg_i}&^r^ngnW3Osd94Y_4%#!9AZLKuT7QPz}_Zqw6Nn) zI0a#2R{T4VFR$>Qy=EiA=PtNt(^qd-1pAD!khRz|NL>%F1)kUYi-1cWQ$s^6r`~iC zIuJCjrV?pjFefhdo7+@$eAF2`=rWN_tIE7rJ-wb){Kj=WD;g2eZall4g_RXW?+I<& z+}x$S;rzW-H;7L?2cIIF;I!}A&gYQa-JZL*%TtR(C^RNdGNOA|)Vk+Eehn_llVb4A;^TkLC<`}Z*XxPiuyE1U^QT>pbZtfP zRAgizuJ)^b0mx&00XW60Gtjl1A7g9X6}D>-GrIPyDaa<0kM`~7xk4WyWazWH1kO^2 zdei2H4nvbdySWP<>>>-W@ds9XYls*5zsy{6?k~rd_fOhwqW;WQWPvmtVz%bZP1DoM z_A8j&dUu3fU%VHEWMe{A)pyH+)w|f(;h`Y~5|WR%*A;oc(k?Eta&ue0Yd-m2A!U7L ze_+-!%b@bKfR6Io^2BDY&A-LOnCcnDh;@J0;j)CFXYoRN&kEoN3$GML4Vt7kmTWu% zs`mM5#56*> zY}oX)x0@Rn-y%f=Bqcebj!3QdbmxMRZ{ID}KSx-RXJDAquexXrSXS24BbFNw7RD~t zy_qTNCoE`gb=+ydL_Vt5JMCt&!vg2gR`Z^Oa2jf>&kNy%IgY|0Bfp&(GCgWDHkFi5|hiQb!`_rdd;^E5gb8eNtMp_sy|%UCVPS0&G?|^2BN<}E(!bSNJpghQ zA0JO(sP&>2E`19>zpd0>w{kp)xY*UtCW+`~NF0Pnnp^SDN8CZ@$ z$nZjoG~azT75Y&O*a>X)^lC-X&zd;4)d86{s{ zQO-nw$w<`IG04qoVi?wCb((@?T`OwCK$J2XPQVFf3Cw!|lL;*RSQHM)=^Nb7*N^o(R^09_l z+x(7{1Lz(Be)+lb=o3YSy-0a4Cmv~ezEPrcZ9NHEP|t3>E}Fd%Q+SW2=IG*a zh5Dz(0Sp8u=$d13^;Oxc#i>4tb5`C>wRiG>r$akDrv;a>Z@W%c$rr1Bz)k$~Qq;!3 zHqHqpd6B&s=`*61H74wLk--T>%Bis8gS53&9F-D#>3IoKJu#=ZPS^6y#7EA4>HSS~ 
zcr;{qjWF^G*fAhu==+?30|T{`J3IcP3VL=3bi&(BZ4+z@efT`m+&Nm&`RIiT(Uoc9`3I83kt$Nfx|=cmj}T+vp? zxk?b5$p1T|P`O)k`Q;ZfQd_r16q>`?BBigTAwT+90sY7)D2Vcj7I5sAk{O2P=3s3< zFE8iD#f?)ix)oeXd72fym#V zH$(J?mOPTQdn2GHEjK{HAtF6zmXusrU5!&D0{G_GXQo`Xmp(v0$>nOM35Mq7=JTssyAm7z zx4-!vxwwesApcQD0+Wn30IPrn({*{bXU72Xvi^Bi9G;B(@Z;Dl+*)hy<%)T;Z`)-} zXiUR=Lb?^qzFNPs?f+z&j{M3+YEfk(REC~wP?X+Y3b$V1@I6AYgN@jpyT1-C;|Ay zQvld?Q1DSqTmn!vk?qvC)}lS2qO`ObKz|5F%DYDumzm%B4Iwn~LH%xxlDBTnR2f~r zJ+{h?>xPz>&4648Vgme?KY#ucSh%^l0lpcesso%IgwkCCSP#7Btwh+^`bS4cXJ$wV z%k6>Y2RMsg!O_YMYiuwPpkZMse2*ZNI#>6=%<8fD8|n(qA#qtdIW(Je&PeHxnhwSv zoH32smC!d~3w+1Cu*^wzNT>=1()Fyr(P~T(eOHJx#&@hSQlj$1S6mnj*nUYKLGMsB zCyhcBa5^n&=n1pvr5QzaWng4eVJ0fXJ;4}fflir=7L01wd@);ppXU}zk10zR1WMj7dAEdGD5=(2OeZf4{r^DwfGl=IR06Wx0;yro+W9v7+U1@P z8#}tsedi`zi2i5w7J|5dSXID}n70D>Dkdg|oSb}kU?Awv+>|H3fIur0`cKwG=CJY1 zh?{IPFDEDH@rS^6SXh|Ocg+Lv!7rY-2AEqegpdL-!IUq>YHo4y7FcJ&^9njM*(f@l zv&lIygvXe;3R=~<~s)mV?d8m zQextg(}a_409H1r2e(t=>CvzKrF9WhmKJ$y!CS9Y{%%8UT}5+@qefRbLx^ZjU3hpe z$Z^m4yaZBlz6m)7t$=f|3cR__RiM5AK=eRSker_068IPZs4K9lGOPw&g7qJ$%|Ic* z&Drk8{{A8;WT6g~R&b0^X<$oq!-2dRS4scq2n@A}14)hkVsi$`Q zaR(YXK)3-lknuN7;2F%#wE|kbxk`=loYVDQY;^R9f`WqQ>)-&40M7Xe>Wr%EI z=LR0Nr@@2XKJWk)f=W<++YyTY3iTH(sHp+n#>n3qJ33_N!^!@FGX?hiyu3WzPU<*6 zk?Vt=C*S+=_1bEN@YpDPUleN7N&A7Ol;7YGgK`tHvG5CAstoY8&x*KFQc@D6rlha| z%mw#rWhGBgYh>hr0qg((Iv`u1Jup!4H+VmxqN3ivX{UHk0-R+%WYd%%?NqUd%k2Md z>)#74kU;TiM5~>C*IcwIs%O6e6}#ZXT=4X+=JbKUg8)gyZL4jKid<*0_HV-*Tec); zulAY;$4uU;WXmlKnLP2kG+h>brSmpz63uyoVQd92VtG7k*^7hy4=l90pWKh>y}6r# z?qTxrU1rU5i=)B_&S3{Xrq)s}hhAnSvr#Egnyh{f0|qVruI{y{tE+(m|6%;Hef+H2 zvFPr#A@4*|qFm3f7u#$AJ^Xh)7hB-2fHhr2Sa_x84K|-Z2itr&u(@}C#6>11Ae7}m z;B-<45pV!?kF+#j2PVY0*jW01@-tA-3Q)0Ncw?ielvFD?27q3X)&9q|3OSM)7FxWy z1VT&jlP#J(-vUPifb4b*HG-@#$jQh)>NI<8NFV~myIHlR+`D%?1qe++nc~3RfEBL& z>Xj=H9)U0KUo{3H&a)S4Y9XPafFS|@j2nrMiX3{bOzsP6pMYoqJn)x5ua1a_=(=Vv zO3`A8tMCjUFf%J6(5L@lXw7(I=K9F!6`4D0+KZs}T-qHSw{w`r6qdUnWkT`!bF~*Q z6uP@C`2bxFoF25`91`YC=KtE29b+ci1>^vrh)|vfv0_WzU&M$cdS_C~C|<9Zrr9tu 
zaBzSiVZ?i-so7aFs%!vlfsckx;ypJ2_>E!B&EiyX?~m7&a&V>*DEMtKI)tmm1$|t} zHfzB2LdvC2j|2zRBj?}%A~r3yWIcFU0Z6Bl`rueo6VJ9*Oy)#RDfdnU_?`anr&%Q8 zq`snQgOFhTP+HzPXC#0#6a8383ZW<0M{kmGW&+pWlP9;&tVcpa#~Zwj9?*ou`l*Ae ze^%eY7IchCnuz#cZ`Jk>c_m66dadj)?N$1V)2+A)pDN8>yx2EYm)dvOw`m!e*4MJD z{AMevgl>30UvwHZSrw)@@GXBP zsrOG$0^f@?IUbF1v!<&vG#^^bTeOPYt{Q6LARqaFo%~fh;y-H??f~u<6a6v3I?10J zqkvpGD?|dkwdCYvOHZW3kV1?iJ0QRXj+?r=I>g#FYqaN^;4uadWqgQ;rI;dG2AJkX zfPJ!U6whg*qoV_GK#fgJhzts|)FSUt)={f@d3ewih~q;BsmaK;U@(L&UOveJwChTf zhC={$gHKvSNC>qabVTrHv;bKfkO%{XF}n@ezZk-S`(b|GkbTXYx4yg_XT*q*@H1d+ zLhhq4AfaFj5DB=u0Pc@{9@f*k=d6uebDfcX=gUAY((92{H>#ZUzxXX7~Zd$1^)HF20i^}Wkfz*eP`}+R&k{AQU{xiLDhp&o`jt+4(jU~{af!q_&&vTA~jpmLWz&t~_+5 z*TaEKJNbA=cv*@tx5gbotTq#&2YuhUNf*M;>f8ey8Tq@>{{=WMc2E_F!&Lj@N3epWC=w*JDGlrl$#fmUdz0wZ|8|L;Z{kePV*&6RAxO4a zQoh2URl){>exc*huyb<@^6?=Apuwg6uJ}5Hdq7q;U=BVte*exU7zHj)Ct>Ougt2UK zF9&d&1o`OelZAFx*VZa1vsvBTgFUr@mDCw5Gghuyh8*Deq!RiwlW}$TVJ|~09id6a z?+OUkTv!Ut0|Lr6H#gfA`!HS2=6m0q-rtUYNpxXS3GaxaQYJv;bxrFdb*e@U{LCBZ zhU?C}+V;fyZBue*PB*rC;S|sce@XOEa4oi8%C9Hbnt?v8TkW6o3V&g9EGC?I$^`#t zN022?lN|y%ciW?LltdiQ;Wv1>{Q9-P(CZS)isXmidY#*~SBkiH$JfolI-%prZjIwv zSZzJZ4<;0$Y_QXd%A8de#;8x(IzKIY{^QMmSG)=|4C;w(Bo zQI>RTr%iyk|IN+jVE*PG($Z2jdHKVwZSsUCWffIbi5VHjuU;)xr28o<0_6mA274+8 zx?wzWXy|k6SL_9U|Av5qFK!dt7OQS~xz7*(jh|lj_i@~70l)P>2pTSZ*yi-$=Lp!k zI0B5H(Q1TdF%~E*VEzGCfB|y&$B0B(`dDfa9%g0~3hFMzUVs{b(Jux(A0T-GbOHW* zhNPw2K;@Op&@*A-_aI;bv@V^)GE-9{Z?qC3BMpDrYk_0Rybi8xN)`-epe}|<*-y}e zy9`)ErtI%Z*`Gc=n5&}WlX)fhRR_Ft2}Py$mRkVf0CHO%tifyE0?2D034!XNN^|^T!|+#7^oLiiN_Sf!|Qx-pb-F;xRW$bL{{7GZ3c;kCA_m3{mA)) z_h%2Hqg4IwL|OSJ=6ZW?yCbNXiQJ}`4Jau~wDJX^(D>CRVgg#)jolADLqmXoc3Z3` zMnrT>=E-C-jKh&@p&(TB8~S<;2x&Dn`L$Dv<2!Bw5N^XU{UNyQNFFcd*Ph6|N1$lC z1hk1GCv%aW=d%4f`S|6Zk5xx!Tkeg}*=zZopmkncV|~)brzprS04|AE%*8oe_J6f% z&Mkbqm=B@J>a>km_9dYr69%3P09TVjzL?i>%Nyf4R!Bf{x0nEoGfT#Z7uSn)U7A#tbPDm(|e);_4&H3Ye3wBU( zO&cz3d*$s%9xY5QzE^+oNS?j=TPHDFN7~ZTGCS{OVCFd5tNE9#=ea+r`lmyyn=#+9 z5e!irhP;iA$IHWQ@9nWTf!-xM%_lWAHDE$@_1OREebhCU+0y2FSsrj7%=WTU>cdH2 
z;GX4=hO3TQ7`PLC`dXCdXmjr)lGjBtUmioDl5Q0!ZMR&?S~3t zxD8cGjyz;bB@&5VyTIJoy!JD8wc#7<>Qm#NvU{{&z({_2d@T0u2yKD38nbtTb(7I_Css)0k?Ck7p zTU-DO?(BM_X>k(u>m0_DxbEqse1tr^Q3}=rlw1@AnSB{rl%uY%+r?W{++?G#YvIdkV!vZYt+=K`%ht|GG}hjD z^<%jcD}%cW=jAUFYzrgUbOF79`v%jiZ}H0ovEc zzS^1}fQ+i5YaXrmoBBR~OA{E_Mi}dmg&5+kPx4Zss5_AsFF!E+fesBl>=kYi2Mbe78S{^Pn`AN$Fi43_dXgw4?X!pM*vwNG7=w>^szH++O zjqY~`kPZ_=LsGzWtuCuSqKJh$7g?pHXMg{8*`M77 z_leJ)C};rDBIkW44uMaBeq3bahS}oD+lEzfjZE>ws`mG|PntmhW1+cuowM^(LvB#% z(gJki0gn&wGE5fgsE>Aciz<)FN?vv@x8B-6K6tFO%lz5YHI^bWaYOEG;rqh|sckC1 zD&90m=(<7*2>=?3Wr5$Wz828NLCuCsaG_nhzh#Shr*Tx;D|+}BbRcX{m25FJ!0 zT1~%sjdTpm({t5^FB|SMevHgJI|>dB@&F20pxq{6I!Rv}0TFTIN1M$&(1N`VSuXX#Q}1w#DBJ^gMgmN4DBD zIkj_ypagmt4Y7k@0z@Qtev7}N(V3_=$Tx^qWzh^5#=AyWbcAO68A&h*?=FOV#|Fk#;sa)JXWx#F$BmKBD(M;`?^?FDGMr z-v6bq0fT4bceqJ$a|tlG z6KKi=Cc_pjZ+W8QR2F^mTgv#gm&)?E4f`&7PATF;!gA+jOL~v*)o${YXw|y_7m_cBjo}7FeO)wijEdJ`1CW9uEZi{o_ z-@na3o@s|Su6kD|P@ZxEQF5Sru3F{%J?!4-h&$KEyRX7seSocG6bM{x-90)|>-0%C z<|@)bM23XWT|?z1g!3ma?1zKR11pPD9P}Iap?!(V_7$MV&BFr-17UqBzR&LI)YoY} zK|P84k&4BNp#n6DDv}P)%Ka&MTwLMitBcLC->Vp0LT&fXPXU~SW1m-=w*Nhogu>v* zSWxb#8clq6(sa|u*4TMJ^P=TMj2eXq`gYCv+pKCd91v9~90GHKv9Q@R?Z*tR6k(CI z*MT0uwQ2hvLmvTla&yzPP(>T)g*<@8nWX|ATx@K$dfGVio%zdTv0l;XS376C zyzmam?vyTUc^{IuoT(iou-a$q5GcT+n}$hlqEOKjUXZXd(y<;Uv#cP06XUQz|H1gz z_~9A;T483GKkr`S#?ZL8RS~;}PxhQ{;t6LPgK!Quv&$@Zq!EP+ny$sp();#zvPaz+ z=9AK_gD{^RwKAASWqg0Spr($y!<1V+sr) zy4yCILP~2S^uPGJ)bZyC0@-mMJ9-0ARkBw?i>5{3|IcqpGPV|ZP#{Rl+dy0fHtIJTD!h-ZPaOd!;I&6^(Dmlvlk@E|BYyLQa&PVK&3rAL z=2*T$mO1cP#Iw6~lM;^B%ws0>7TipA$3v4dS7gG)6C%UxADXr}V#GNzI?n#t7ABlt zXHDDxR6Lf}KQi><7_!)It$R0i(BiryC-uCmZ^Kc#vArAd95=Gx#K%i!cUam&_tr}E z9T}vzG3oM8MrM%!o{vdF8I>e$%x_TfSpms?@A1_mMUu||fxZzltNJwb#s z1wCFpLZ#~rI$}3P*`sqJB9y_41^jBp#WZj8vlGixMJo~Dh2H|Xvnp0LRyAg)BcTmt z7Hi*jgVGAgYD!hMvZ}WB7TaJN>fijcJx6ODB`)QnjKzfZ2Gb99n122ZH{ zGF|9vv-sNQa8qeeP7tNiyY6J4^;W3)No#S)%#zFZ6xwiP!skvpovqB;yM9fJ48e~Q z29>K!n~AaTQGi5}c#5{6Etw8QWO>-Q z=C&C*B-kkJK=%x!_c2mIF%`R+vS9wR5PnO6DqXUileb&2X!mi~^KzX<4#3{xwK9r} 
z0DXg9(;({hI&UU@0oUojNo#J{Ru+EOdQwfS#3PmR$A+Fhk2f7_+t>KV+~a?f`i<+d zhEn%fTRSTAKl>#j8v40f_xILs)@yXMFJstyI`@an6x5IPEXzz6@?=VOMlaPszeJm7 z+ud_o%tY7l2wWux@XyCUfK}*HclP4&I2tcvuslm+60Vy+=oHDJ{`1LEV$&qn5;v3T zpF2Vf1Rwu#n0*GD?zeQm#qhtlIxOBET_&getR%v*+AEF{k~X?cZd!VD zpkHC|X}Q)+aBnW8Xgh===NxoC_+UU!^e5T*Ocz68Ai=dXf^2q1K!}Ep$*p~DCFZNn zGh1FIHWGEZXz6d#SAVx$oF4l4H=d>&i7+2ZA9hi0-NpgA2!g|IH{7Y3ToY!t}cP{(8EJ`apng)5Xj=B(9%usm#8sh?60ON$MdK ziuPp^E3pd1oS^)p%yf`_-Deck4=qMI2fcyszF*$LX)gu*s<_M$K$tc;T=ES=|Lo%U zt5T~5HI*SMBEsjIsZ8wcJ|MeX=6xGV>eZ}91cIOeJOz5Z>&VaV`10j$B`mBfHYTQ8 zIiEa-IZ0+ZF;VsKu=1;@0g$qp!mx938&$8xSf$f|G`8Fm>elACe)Y>}_-JjdP{{cd zf0GC(wse}@A8+11?HFIvZ_0Ihkt^>#zw5Z3f=rGDw7KpBjxpNTbg9wiE zumDNrHBP#uSaJfBTP>%9YTd@)EBPV51)tem$CGTe@4K3V9hD6@bc1LGb;p7krUKj(IA4E=`+;7wR2@NU0Btj2s%}(Wp9VjfUHDXdN z{?CKE)7*f^g^ivb2(P_TExJ+6XgnE8{>kt0`goOptp*khv_bRoryA5-?$?-TI!(ny z%ky~HPcJn*A1%6+i3C|YtVyhN++W1Il#VLTFZvM3vgcbA=py2Uv~pK(*M|>fVPWV*6bi&wvik zyFwXXlSri`sVcA^GeRKu023vbpt=ZRas5VZ*^+4MeRzcz9{cS!4OkxGal-)10GtQ> z)t6p2P&tZ+dAkq1L)7(gS3k9!7A}F;z|asiC+B2$?q!}h#c7O)h^CKED3B!Qe}9FQ z%8Zo7*D$v1Ee6;jXcS3s@IWJF-cR?v9qtR?{;Vu7fA*vg0{xGbPGdMe;}h zmUp{7+eNn8=Ht?Cv^~Gr?V2x9e(DZ-zTDetb6DdG7V9m|;0Ew;z276E|FhGRP1uIk z&%2sqav4F3Rh*m}<*ot7N2=Fxbp=i$}ytU55DLpkc(y7c2tL?WF zhW^B8Zxn895wXe>5AUF>kM;AGq%;|b94^#7FX z`jCM^F$MIjFrmi{<9J@2GO6)w!lnn!P}ppjch096`8esKmDxUWZTA`-9aSvoUG@JO z5X#O665wZ{S(h~fNhpN5((|F&Jv#HlzW+EQn29zSkJZNH626iiW4BnJBvY20A}(_B z5`4s_>GL<#BQO?Vt13P);<#EhSY)sYc}s{$Me_Mf`b*@dwMISEnI*?-Yx2{s_g7Dx zHX2}Ie>PD%BZD1W?RjXYd#02XDv>?n1<#49f#aT<&@1SEUxlIlfQ9>EU13GP5| z1pgQE8sU<3?lrqw?7w`34QPrRTteZBDXcq7$+t`6&6DuQHt~5yl0A&@2c|XT1sUFG zUti$xo4zRR4Dt`0Fo}t|&4zv_(o}~8wGuM#la4D2W6Q zwK)(jQ1RTKqwv18p3V)L?+IgGZaC-jd)OE|ARMv=RH3$bNlAmiJe&3Eh!Fzl4z z&CD@`_^Xdi&;HFMCH#;>l(DC?c)L0r>t0y7CKwB>ZOQw|7A{WYKYPgsBkzrP%G=Z# zOw1hcKahk7^XHk?$dNPbuOF=0XNSLaSXs(dI4V4mjG%!UEcn~?4vc>G;>hms^|4lg zjM_wE0wvxBqQk@X-zNH$#wJc&djF$rWX6a~A*S8SVjcw{rj`>E>C_<^?euaQ5*w(& zl!nC{yjdb_Jow|BkMPIj2WN@D8f6>rH2|`-qEIH84h50H=+E|4y?bZ&!*{6sCNrom 
zD3HVs*g*3pLRs!cyL*S8&_Q>1uD!Q?JODzr zUAb`pfeM0x4i{=xXAeh=_*j*`cp^HW$DVUtJl1~5Aj-5eA;-)bP9@^%S_5^pH29Xq z=<$IRj&bxzyhqB&Oms(TU21!j9~LW9-E`D4TFunFz^3az8#dDl(Dho_zRgt_46gSw)nypP4zP4(&V4% z!+{w;0rcgG-dBA{f!wFOpf0so(rj7ZcSvNNUiIvwT#T3Uqk&pkakqPB@m?|ZRbJ7e z9ltf0r}tWI8t-^^_kzdo+f_=1&$>DspF9YrG_G!!Rd_RQ8(vSfa{MHPP=^x#OgX*}d2=T5evPkl&}IWo@L zplf#Ut(A@9C~!+yI>@I?SA`Wj#~DPV$t5I?Xo1}D4tZ%v4h7jJiu9TVg%R|rE`E(J zx4~0y1sQaJ%FFN_z;@a64+r;>vHF08CWcyS)r}Oo-=Lxga^Fb2Z+|Nv?IHFTKSZ?j z`mIroB}r_ClWM7{DO<}}3Xi^2AzO={l{q!7qc5ira_Vq*u?e08eKO&2xvG|}EhD|! zzs;@LyEXr7{Qp3pZ@cH~qo)Q*#e?Pv`)>@Lbee@?>U)}LlXkl$IBx{4+hQ$+>gJl1;*8-NNlZ{?N zn&Jc?xk&B$QG?(}`}x#XPtTge68gsXKSG7j5dTCDWZ`bu4*C}wUix#{a%$}1swIxZ zgdd>bAYBiecj5e9qsROZhwN~=SmPSK71?XNc}P|L=h3{t*Q8l1_-SCqv%}SjN2mr* zM=^ELBLEyJ^=0b*DLUjo>zp`LzF^~9mOZDk7M~-3UBy%R8^1R`K=UEHk}xp!eLY2m zfXU`mdR1r$BdgHLqAqPVO>IJc#&X{PG%Q+-ks1eDODm}5_CY@K*5BK+Srn+*-*jgT z*r>T&!TNLaB`T#9KCVlx{QO?r)^=WNW@2l1I77%V`=)Fiq5J0a9U$8Qa+7v&!i=;o zJ0fA_lVT}}hWx3gNs5zQ*^c*0GY$~_Xa{Lm0J^10RjyvWVwll=nmOOcrpKFY zXR!*rZn${f%%NEuQ%LL>lzPLVGKBwt^u5PScG`^Rvx_HDW(pQerLR71(JakjKK^ja zxkZjzb13;{HR9NKaueeRit>~zo#OA_us@}`0Bq)L@o5z88!)Ynip5?F4r@w@C=B&8 z_T%76CI0Ynv3L&;8Dc@&dC+0rl?Lx%((%o#PU|*DJTz)fAr#$+h;j6eqjIvzYSdYK z`vO3lD5!x=-My5B3J|v>(8AMea> zNMFe_w2|Z>cDLQ|-IHCe++)YT9xXA_?8^-5+%`c39&Nv?@eMByi2xpp-vc<686yL% zv(CbpDE)M|-&L&G<0MML-`LjD0-Gu20U-f`qz-wwR@c*)=O;fZS-!r%V)ko(jO65f z^G)A#42PGSjsdcH>z}3n&55|K?l7x~7XVYs^Et1V8u){dn2}LeREnZ9q_8zRys8Uj zB|Q$8(8%q~vdL!WjyX;< z%q$G#LVOOE;-Io;&X_KiHJr#^nJD=sJ{th<&>Ra@^7?%(Q=2ekKi(P-5?3|K*~szx zG05(1W8KoL(vlc0-XxJtp~LN3+c2?+h;)3qHQTLQ#SEIJ)y}i_rUvGtfvUT8&DgwN zP;PqN?$(tVnttX6Ew%WJ(cl>L*N>$w8TrG_L(TF(7*fW@>7rF}85x`IXd^5mOIX}< znav8S9Hoh2%a7V^-_wXJ#z!IF8bA;Qt^1dGLQ0X$XGst1JwcZI9fy3b50dmr!R=v54Db=nEZ1Ok+AagcP4HpZGWna`95V54y zP`3Ox+(xsK;P_ZrOt39&aH=o^*pkZm{>sWNq{S@=v2zqWtJCV*+8Mepj`M;Z0Y4j% zV?of0{Rs=G?x5r69+J;)D=KNQG-Zx~%jjTxsU;--T&9reXIYik91fTwJ`~Fb**`PGZuW$!O>O z`iGm2f^XPvtC{{b+D)mNm&ZE84|}RB!G|=h+@*7soisVaWYyk{q^$T7mzv)uLZ`V; 
z*WSY(Le6)3^DNUgouPu8;qvB-+;1*gIgs9{vuyP4HsqVfi^{kBiaIEl z4>ys*CMbm}GAp&u*kyXE}1G?6$aWDtr{I9yO_jIW*4DVY=caGnTQy z4LToknDpBLq4ECWl$h&a9*=yJDE4QXBjQrT~VovVY}uxva$j= zm7gANm*3&W;IS3pb_#J{uwVQKtH44K+(h&tv$K^z?!wMcC9^CKgF+JSlXKGxFsCWQ zMXwX>C4qWMH0N`UD3Dz-cuH55oQ1poK=E;_|Gl+jvqmcNbvO z@2}VT|KkYM+T3A+0z*|=tn#ZfMEQV;c^p<_1@3gEv+NG+sxo{oH17NijW5i66~N2i`bse&?@``{5&}|F{?fzpR8d zBYlUF+Ib$O_sN_b=C-lLY+)`<=eB!I)7VZfZMLH`LQz-PlV|%PHb71a(d5Qndyr|; zQY5OJuJPdcp#0bCf|7R8?KzetZ1iCk`QIF&+3bFu)F^h9OS^vC?2x*mWpcInd+M zb#<6wyV7j@a05iXuA-uKGyA!rJ>HE1QB;*7AuP1CuEQz#zIU!-C0b~&0P=`si@2Tc zFZa43uh;(eski7mtTiD>p`Pr4fnUL4!1h*IN{jE^)BUrr?P}ZNp#N)q2HrJZe?u~5 zR{}_*tr<;(xnS4z*+sUjOlh8S*qy=Qe&&MTE1L7>fAAq{lfRp;{*diE5*+jI;&zD$ zfn}T{@&L6XocMcN3dsR)RleKXVTr?ioJ7BHfMQ9Af@ZlT{4$(L4CY@Wr*Zf=jRpx)+H|D;{{JR#3b-G&o6BqGJhtHuX@msw3F-Fu31Un8m z7D{Bu8ohrvAr*B|XqbPAK?ix{1Hgk^p`-Z%Zhs_i?{crpvI!1ua&k5@vf1gnKoz<+ zXR2mfZAr~}Hge^&4)0u2VF`c=&6P<;0ouXh<5{z_k9b0X z{dEovRpDIOmU2~Q9bYtVIqnN92>24t;a38L$A}hPogYHiJ;p)=T2E~k|yu$#haxN>_FQoZw?S(Y7 zta-^nmfm$VV2-Dqgm=Iq;dV)2sovWnvVG?W$Z&9Y)g=WaSc3_+!y{03ch*(5C>Xy(!o5 z^Yd)i*j~%VYGmyG7}NVm%0#}A)B@RI&y@6LBc9a$%j>M!R)=%H#I|;?i$?#^8iv@> z7Wap%{`njY;;ar2x8LE1JI|eM=F>^UlOIk2PXKmtc!)W*$?`}|ZCzTEx2TFOiny{Z zI>VKTfWq0OF3bfjeZncXE~DpXy5Nf#%frRzP1R^o+4xotWf0?p)#h4n+$% zO&G);#4OEfQL+$(cYup%G-fTk^=NQEY<-K>SFWZwqm7&W{!J~DY7yX}g8+3Ax2tqS z;7Sm8q&STj&F%dtP=Yhz$Sfz--kOQ&?w}N!8Q%i5M-a0hxOu(FJxCXbaE!Xnm~YJS zXktASZhxAoUiw!UW}XSbxq&XR$MPM$W*2D_5t3i#&YZIYq}HV+)$;_HYRJbwZ~e|0 z4@W*8q(?cBtE`Ud@;aa2`d53+xZolq#M%q&vo?NRlkWNgcm?fTpHSMb&rA+V8ZU&F(yaH8q#TF^Jylu zFlHi0%jr|~{HCMkrtOVfLcm@CZnH6798w5(Dn(al91gRSk&)8(c4mrMGynL<$$0}n!QaFR@+6^+zV3&z`z4u>EsEtib6tnoKKFZSK5l#I(X^S7;0(pC0)P^#P7n?UMaf}E4Hb};nnE{7tVv&U5VnImqKns296p6BXd zd6+|L&QFF?TGelfy`zsa-)Hgo|6Xbu7hdAgY6T1vEp`WiA~q(%eGV$yvc zBnP%>pwr7jCr#cq#bwKtl})bjL%PyJK)24P5?~)?(^Uliu!jI zrd=c^&9U9Kp<-{7HOOdzae6LkqVk{adlQi9TA4{L(y<>QdM4WK+7;_+D?Ft}wpcd!n0V!4&5d6y(2Gs#@X#_+Z8v;eE4T3&Sw-Uq? 
zoNy)gh{DPic)(|<`b2QQ72#io%^U#8__{Ce6oW?10ZUq!_3TJM7MRJ3&1Qdjq2SyF zao){?%k@=vY#%pkgOQlRM&}`es^34 zTmazuc|}`$Qq0epESHbA_J|l}`0jx|Ej`^#zh!-M^zCydqlLqx?V}hMAz~Dx`;Dk- zuAZsQpQ+_Ecz;XvA1JcQ09Uj4pVA*Bl}w+A6BQvsCAXkQ;IHi8dr1fkb-0^uwOmr; zr;by{)j>iP{PX@FV?p3~F4y>gSC(KfMwM;6D(SJ<+v!a_4o)O5V5WnE3tG5fw1>fN zun}CAn&d94Q44^}goE~n%g7P`I)zSOPqGeLr4&6p$=g(s{43s-$A}=r(?bK~2;h7y zKM=Yn&yOU_3q~rBwg`EAR^cN4=kQy+oXxgxk`Y=eok^nw=<>cc8Z55i0o^YWG2)9y z1J(HS@!u^Ot1|#!GQ2x5|2xcrN|VLGc;l9+u_FXOXPC>?DS!;PVc>}So#MboIXG^cAF<*Y@X{mn^fDZ2?`$OWU&G-c01G)BTtGaq zW}#Gw{NCrEG!9nl95c57WGuO-Tf>bAq#7TthUMkK9kew?F4_pj3(aW5i%^7P<1G)Qar0T)nHv0?sB)=%< zqe0lV73T8@+mlf#R7xy^@o?C-M4Rw|gZcpa_y5l3z!P^l?*CMJrV)FF7GeGJ4&fpI zRS6DT5V%g-r4)f5+C-QcOzM!`4UOuOF&Ns*S*d_=Q1<(x+7uWF01xx0o(CUbMByl2 zcM}HW9h!gzL@85Q<>U#!$9<*PW7tLuhQ1tz2PgZ<){fzl{&NzmSLY8alyK}aGcqgS zJS^ldC)+#GY~-^oSBr|s>ngftfB7Wfe!~S974822l>q81^y2ooir0loPqrXd`W?s|?jns@Bx9@OIS3QqgKZygR5?u9~1?`%P7=ote-<|zMYPz>nlyM2av^E1$Qh*ANG0HY7a%Wq@ zzOOg;22|@XSL@WWsbp^MTl@)>58?UdoX^JnbTx1?*AN3A}~>PdLfobo%{6Y!Z^D%T;l0ct5k}Kf=2ajwKcaV z?9x`dO9;&Q?)G-Ks;PlWL=gAp*9}yAwIj0CRU|&%w8c!;^{M@csk_^9nadRl_jU&l zedngu`P5jgXr!r22@dWX_RE}J?gRDDo!pfQA!td1{M$7Dw-KOMVn8MRb%T=HmhXN? 
zue)R@&Dmz@yupOiOxvG^UQi?N+vFm>x7rVpvOaWC-R}xzn!ORp*Tz#gQP~9K5+RH& zu5XZDG23)RdR0%r?+%O6MHW7bnTAmBYHNd%$mA}mm~$f}HC)A0 z=X6-x(T&8hHM0eu3X@K)J{yY+3tfGD>~BKxJ)5so;%a2PBdy^08=?3&IjZy7iWcRR zZMV|c6m!>1dj^9j_Sl+4)!p^=NljOOOsD+daZ5V^j2Ch(^75z6Y@scKDfu`5~uKGX3B}yN#V+O z!ze+KP1#Zt!0!QRoe5Obbq%VE)a*{~MjS9R`C()@S>;R>{o4nYSUS+*Q&^W4&}4vKu3Gdn zgS)|IWo06ouz-Z;t1aW+Y6g)&TZ4^s_P7Ey1wg`&cwZ}J@tMk3>ipZCCL%>p=wdfF zg*>c1D>YcR2=0M72T$3px2LBjLeVVO|BUEsRD4OK@vj-~dd^$#QI?4~$`uftJcY$| zyn+Mh4xna{j|oNS4L`j!1ZtLWuK#M5(K5-JL}p(o*x5~J=f>H9+5%Cn7P5O-QYQ<{ zB2Oc2HaSx~jH|RE)yp`jSpPKV`lXwajiLQXN8d>?sZdkJcQa+H)YR+Nv(6b*|A;mD z{yk_@jug$~waqG!{1cn09CKN(yH=7kCRno@@sXy7v_G=NR$~rJnKGl)QKxfqvQk+( zcg!H{PXdzRV;qxeO&24HCrOfAV#1#&G6jZ6?wY+^ZJL(FH0=NhD$2xlGBJbE#5;{zW`eMvemr~dN zQR}K-zaz!yklA#+XqRgS*$^fXXpTzI@P*RI>gIUe^Qrgc)@4bRI$xECyHVMy1g7yb&&NTJX}T94Z{r|D1KF{%cPASwLj! zLJZU{HaMj!P#8&f?HuM@wcY9o>X~kK;wB%Yn7TfA)Bu9EG_B@v(Z0uP4PJEJte@t|scOp@0$><>dJ*v=p4KTnP=j)aa+oy)3>MNxwc!==ujn4^X* z2LDDtvvE*a3=q)cRErZ39!h)O5=m~MZh8qx~dot-}@|32;3=59HHSNpK}cBw9GrZ6nbUpcf~b1(*fEQba|zGpICEeckL1aCPH z{nBYR9P4NH3hwv&mq|1TYA2QwEH;k%@I!7<$}rgj10&<{bC#P140`CYA@3+gZ@jzz zuUIW>U#&5@u4QZwKyiFo-pq;g134I3F0Aw(y4{!PX0pg`HeSUm;d-9jh&7iXND`bC z;8l(4DZcXj-U!53)oXvH2MiWTlT_R)%e@m-S|?A}G{ZZX{nQMFEvjYBtdqIgW=k2b zhx-}SY5Uh=md;o{?v~f}(ty>+jf@15vSxFpHZNm4{79K+KWFnzrGp)xY!0WUl(|8q zD6&U>chT8c>p0Hp@{-fbixFYWvHZPP)uM24G`-SdB5Ikw9vg0KQ~bHuK^h!%ye+L+ z_9qJQ96|(7blOGUOg^%?wVXTI)&NWJc6IVqW0w@3^AYhBZtiJvOKkbp=^PXF;+a1z z--vsym@FkbD^Bg!W0@_8VGr?mLQFb~!e|$>#lGjQ+o|nHiwA$|XN&G>Po;8_Ub)VX zxm|{3qO;@E=HBd*S!FO+{At+6eIhf~ht#=qCS$dGUw!Z9eZ=GWLCkA^gFSz-sG+gU z^F|MGtQh+fgIgB&p;=_XaT2EZw9{&bu9Sz?3-Qy9zh_q#Ypb3)5j=MhVcaPL)t6N> z1)KBHux}jQ_c{e7yOS*MUh!*JAn$G)^c;mD9UJc)q*wdGY_QpACZTJeX-g_N zq!&BQGqy>jiTH)!^UorCmzXmEeLvl`7c+0tR%_rIlKN;LGgJb z0N>a|dW$Qjp|P=`yM=m(lqbOJit-)L1F3L~PW`80o(ILW051EPqaQ&Yy%9@h{{HoT zg4&LbpSp-Q;fqi2E|rzyu8Pv}tUqY08wp>aN9{KexCnaqYqS;=P2%VCbMe%H1{CdeDa>l4UR8(RspC;X#--f_^ zUtr0G;BuQttKIkOlJmN^*G5^!GJhG9#P|JRQ2wAHzhJLPTxGy)MM2yhTLvrrwp06s 
zdTr8JE-1MYlw8M&nL?Y!5TA$LlD$DUD?hrQ`hMYEZ9_2bpuI%6^rcy2$0oh$_`IsX zfqT-w4ym!hMvT&gd?h!V?6yLC37kQkdGcOzg#4~j-VAFs(Ql>8mhAnHBe|ET z9ED%XRH!o%uU>03u0&3&s!%Df9{Rqrnl z?hWF3D{FXqLO3-Vs5?C;X0gfeumOq2=~7H~u@W55#>@>xr;ZVIZ?uTzy=CImRrPrN zEQtNf%20CJ_K{lcSvEm93ajp-UQrTLV#})cynb?ehN!Dq8Iwx#c)|d=kPc56m@Yd% zgE=9H=ny)(5^Z+7aqGffon5Z&?aVk+;sW?wCmQh_=Fziw;yp*Yo~e7%Y8EL)VR9L>nY;yq z+>PH}fdzHjFqy}*WIxIzeQIga;XL&6k@G=`uIMIAm6>ktFz6T|xIX~+V=}{OG!o&w zUo7PIM36&J)Bx7ov)7-{jdQ15dY9;i`t3eRipQ+yv<6rCWol7;PCFU`IQe;& ze6eqE6iywDI(gzFjA%RD+FNj`#y8BCKM6D23qja#y-CiPvLS(+r%WoPCAQ4=5C>;O z&oAAsh|*(&+hEGkG#x92V^m=;p)h2|W!$qa)7 z7U{9;1cRGE3`VY@Kh-;vzBr@4HaCjss^`0RXv8ua;Pm+Rh}e9mYo=1gxn__uzn0FC zBSE!ZI?*C9@r&{M*aNA#c~>ms3n!vQbDUP^yIM%8sNVJV2iO0?C}q`AI%gxB*X z?n0-3os_goh3X6|V7oZ1(|2+#E1jamqMpEV0&cbS@~fc;0PhF?AAF8N2QIyvmB)nk zY_%=E$zvO8g!+|04dzpR`<==J+o|B_s=^8yJ-Aiiz}|`5BXTiDY*+O$oQ)yaX|HdQ zkUu=X&F! ze6o!@orwWAnLfU3`)tc?9`-3BsLA|jAD?{y*%}|rL8cmM_8WipPoP@yu<0iC*eqzV zhr%UjdsKRs5BG;WE^0is}nw*?ex{1kTo5vF88^k#GT{t);fs>G;*XaWW11e<~=;;x7Vj;z}Z(hIV z18}w>z_7x>NgxG!JSBZDx1%`R6nkApeM;`e@4`hGjVD4NyKkGWzdKbhEv(TF2RwalBld)q%R)o}MJ~=D55?jF3%i*-`MTgrq%F zY81Fn1A`|$CA7+LUWi;%Q{`(6>1&my=ziYYCC*w~J9$6s0lW&hE{l^Z9vTbO=M;4b z$s+ChF+5cpbG(=1UhO$uz9y+Sy7$#tP}& z<~|YQml8wdC@4VHI^ujLHh+8L!G|=817~^SOIL>=vvLj73pW>++DP+SrsxY-%mHS- zPs8$HORR>z$$V2+Zuxgt@f6bKpwX+w#wCSSlRnGh`ymV;l~Grlp3vM%UG{kk_+ypA z0jy+JS6-pBW+&5JS1Xko!=jN`FMXgcj030~Z~iyXGwDgs3c|Wv_EqjQ5QvAo+Z7!nicgmyJ16f+Lb6nP*>M1 zT3j+Q>hXqE+n`mA|4v7O-ND~~Rbzvd)wG~#L8~@1zKR{%TyAx?T1y2Wzy}q_EtFmI zs`v&8K@U>lyuI0mpjMbF6w9pHE4kiVlYTJm&|xh>#})0NddXY^~uMPwuJgczq_QUtP~)XBb|8o`iRPRl3l<%R6wwyquTp&N})Tt=c+{6 zw0k9Uy>`UP^d1`23VnaeJl={O)Aw|Bxnj@mY6Z93D^MUNXkA z{NnzaM*UH0^^`D&_Ws|NSEWIvT|_L*%atSD@^iWSq15JFl4n19LjChCl6-m|+MaAm zTo^zX_P+U#iu-xP0){?PC3~%;lakX%L@^Ylzh%5OYBDia>Ip3{zy-Gbsnos3u^3DX zj2xyqoNaTv-!N#&yr_MGVI0b5W^xjbadJ$Dn~k*SmtuxbYcq^+Ix6*l7w`x|vdQOH zc^*0es*>Lb?@&kP^M6Z|2#bSJ&do>bM`3lXz8Mb@@!3@!bni_SPz_o+c(W?)L<}f^ 
z@hbWs*BY?VBNcRw;ktwVww^$@Dd7g37`0fZjP5qpqSYv;yJ%x1A_^J>do>PPCS<*$ zIk`O_j8UOGyQJ!<)n-&NTdm2@US-gf$j#;b2$$pW0OIguOW%6ueHtjIz=RpURY5%L zf{PB0~1~od+;6Q=N(iTq$#EzQ25QrTz<6*a$B=2!|HD{Zwt}Bhk=0p_(v3a|BQPMoDFHJNCXw77 z2sVu1Gx{CodUe~xzS5-_`f=X&wx6^_G^CnmxG%Z@jjQvAlN%M;%8K4N*G=orKV{b= zBK+T|hX2$mBU)X_o{bf=;}9XBYwOtoACpBo>;f_`xhVT47rmugvq1x%rsZXes{RkY zV?@qW!D}Qu3l9A#&xQ5JXJlNNJF=vAu#t<7r|`*_ut}WGd{@1`+-GT{e5np!R2bk9!U4rD z8~*N%%fI*9jg8cYEqK)D2JIs|iE~=B<%tr|$Gi9xnY3gatMQO(q7(mPh}jjP^(sM@ zV7b2iT+|3@_)Na-T>9+f(4F@^Yyc>Zjn)BY?v%AX}KT=mP z9t@nMZw{Qx zEWMi+^oEX1o^ZS2Ma@9@PrLz`bD;T8?>j!!=%to;@z%gPHBb6PT4it!2Q2tK5_svH z&mMU3vsLT1)_eY?8nPXOc@szAr6Y#Pqk9NBK81m~qHR|i7?$ekTx2aQ4%?IOuk%R+ zr7Wv8~V0xz=7Z;wwLobvM{OKIf+0RakY6rfnV+^mZlH9FVz|5Mp@ zM@6wTdz2uG0VHQhk|gJ}AUPvxNdh7{XIMZMB?^)e5SMUa$-98Wm9Qiwset6XfCL4U zoa38?``z!p|K6K(_MDkBv)x_Q-PKiHzv{Z4F)z={F1Ce27KVwP>sT+Uy85rhkOO9} zU#LL78`7faE|8)4(Sk7ar}nZr6=-sIB|qS(E8*8^fR|=&qUPPfMzMOz*|(~y5uzY^ zP}0L^El{n}W)XB-Zi0WgWM%;AefG9=4@`m9!J#;9gE-KfZ9Bls56FY~J*kn1(oPcK zUQ73>1=c?SY!52rLgYh$BS`?S2BvUXIF;47gWq#CKu0)a`f zX=*&X9>NXgcvG`$^g~S}vh|uBI9FSJp7U$-c zXWtD82jWdh4sd(1L34az(b>~GfV&b0Cjh$)NidfkC5^d&%$UGKT1zNpsxJG`g%`QdqPr12TUfqZ9Xvz)d`$?Q#g_=xSy#i)tWcNceGks7^ z!L)a%mT;?w%4{F76G=0y zP`EvpXO?XC;um5`i5e+UIPWz#rYbcvGHYyhrp&l6jex5Rw7#0m=md8A_W%3}szLC1 zVG#x|2QKLYgaT8Te-6lD7F0F27>;S_ch$&DIrz;)WLTOA9jhn2tNb-BiD^g}H`V_2 z8$LzVY{(#=cjiXZtqR?S5(GTs8U;&QH&R+)4uTN-t|?RG!FvpBqV745CD3b> zjy`U_^p*iO2jE=A;l~NX-x#Y<%Z!g^`Z`nuXVdVS7JL+Y8_Gbglx9>WPs+aRH-4~@ z6m?@zGjX(>gzQ2Y$orguCq|$gYdr_V3 zhlfv;6Yf9u!ry|L2s2i}!pHZ=OHR%$xYSeLZ4jHdPta-fQ38wqpO|>M0p1-*e(zDE zKdY+q;R(FfdEo4Xbzh?(BWV6znGj?u{HDX+YWs_BrMv%8cdkS1bS=tQ`HjW%&TmfRjtD z{V+TJgctrEm&Isj6LCY}dA@vamKI-K6eA}f*$Als(8K+tWqRU=bJZPKz{PegcBJ9( z+mTs{cbi};Gcy$3nu92R>3EK_dKp|{^t9 zek3mm!09I*!j|J5k4M{0I1E>hm+;>LOA>(eMjuu_1K;9oe!A=iz=3719Ca9fw04>&mC052h#MwlXNLg>M>|837?*0HGMH2O?h88M-I}>j%Fke~jiD zuA`)CPz3%a2VngO@JAONjGts0lQ#Vy#NO>28`KP?Z449ufElKI6MJ@ZBm1UMElgad5!TzS6&O{%Uf;(y;H7d 
z9`_bIFfMug&$N$W&jhQ9d}k>FeCx^Kj)8RV-(!Pz|0TpO4YhwZSpc$LUi#>ymepc2_+EW4RC4t zic1|)Kv3}o%KvAZ?o)q;)*x+1;fC+g`ALc4tgNk6x5c0FU%w_O+V%XwAuiDTn?c!3;-!Xmx|)%9AOrA z?c;x)2xy~(OZec;QKw9cg2TV)u>?N@fOY^8gx6g^9%UFfYTJeD6uOK!hP$bqrF*es zXGQCS%W{bEl9H0pXms#;9gwmi;u5yPbQfF11Hzj5Wm^hQH}G4d!jP`1C8N>IHd>&U zUbf*}y}v%Y)u*3E`MqVKY|NqI#TU{;AQvT0>tBBtd}!zme0<|lOvwSRCxw;?>AYe* zZ5V%?D3d(n?|EQg&&qQ^Qaqg;F7Dyr5C;-B<<4eHjCypeGyUwXA=|kTk@oB!FgcDX z1lt-rm(E7g>FvVmR18F8HEQ0)wJ`wVL+%2=jJ9Y^dY^>$!U@~yG!}E~P1+a$8MVt} z6Y6iG-~TOn>b*zSYEqwUj;ufr*>4g+yzerGitSD<1Ud7L%fxU=TM2B0+y`iSfENyA zdU5^3Gi^p{Jt!qJSm3@Kub~KV@+C0<20R0PIK*C;5+D@d!7&iTiSuCba-85@14;y7 zcsO$1m-B+~I-dJ4AaWAtW#Xl@CdOlhw1DITUXeG|(i9nSf*e#fBQJ>lRluuG;`65s z#|D1kI(aT5PWb6M=0T(NjrlfPc}i^zkESiL^~wX=*8tJP`2__gWuqW}AWO0bfQ9~W zgXjQ2-2Jz*xVX5xQ3{BWD!&I}4rq%U7|fZUDpbb!V+*#m;aDgT&^J#6bMue$dRdCw zk-OdSo$4?dU>5x{>&DyAAI)AV5FvSpo?8U8d#>R zH*BR#0M|YkyHm>SP;+2#yl%dB5!7EHg}3M4?1|Z-G7cFX6Hh*X(h>YtjU1<1-?>Mn zM{dl%Irj16dhqRobNiW~rB}`*AzcFeT*WbY1E43as~~?_y2X~-&{8g%W101hiNf`> zb0Qp^TrQhXv98!N5ntY()@J)78($~&&wUxx!(}s|l>(trA)a;qtVn&Uz|y0$F8y8t zdf%u5mDLNE9S^S8G4E-ur3Lw5wZ1d;X@gR{DcMF=iZ0{v%o?H5=iE{inT{Bvsos(Y z2zkwS`-6E;J8wTeL$!=U^;h`UGNSbK@HE~+?R5hK^OG7rg@FA|*YM5WzS=Q+9BQe{ z%iE(xIHvDE?lsfAr5A)3bbC?%bJR+mrUrcu)#Di}_p+TF9W(3LCN<3{BTzMe<1qMhfI17Tr5cPZbgId-Gd}`H7V; z^fxT_*5VT_j&oe4#kSzhud6K4g@f#ox~M?bBPmPc1hIq2>aIDXVSz2H#qC(ZLB4Wwb{cKrMhWt6 z{A|0?kV+5n1-V3}OR;mcoJ>-aAFS;H)$!0;!4>VtYPE0lUp;*qn6enUr*mx#jVg{Q~4#!TV>M`uIoRpyJT!MK+)?;Vh%bQOn$1tdZzD!@M;PeD74QR z%hyt(6)`I;)S_`JT=)1mu-Cq|JkF1yq5~amP9wKsHw=Tp8-eINrMLJzjan|auB z5t2x4sO{z9<#tAT#t^u{0zvG{_9=SCH_x_$MS5qNbRI7=ML3d7+1mhG2l0CXh0VT` zMX)PswiQOoEi}ZO&r74TK%^SJYK-y9-p>do8}$iE*0x63R*{L)+xtQP-R|q8Kl< zu#xg2CftKQ8ks^ZG?{>qN~d=e_QWmf;t4)_!LUX|G=-q!gIbLcep@Un=2GL6dh-ZL z?V3?*Z|4{ny+wT?Ns!=k$XN9T_%ln~Oh1FW$D7hL0cuvtawM0HT}#7D5F97g9>hq|kal_SM7E(wk)Rs_YD= zL!`Cuq3_I`o7`}64(g6g6Czj|5%7c+cZ=39Y43uCYEG-t=Y+`6BJ+j*fGtYdez_r*xrCO~eGGz97l~jun?CZtF z5u;8p-t3hh?a3MfY4%-Eg7tJawW`6?qrWr4(o??@Vp{4kTiYde+A!vTfrZpn@Z!BU 
zc}9>8#J<74?);8MH7kfv?#2>xn_c%Bm8P2VU zFO0%;x>;X9`n_n$eOduMD{w*MC$$>d!!@_AA~!3^7v0uPpsVGzU|b#zPin>#Akzl zco;<4A{d(dr&nLhd!Tt^#b9q`22+<<80_#i%EY9=Ijd5T3s3Ozsb=6NXmnbraj`C5 z3Xz5$%#HZ;)8>2F!OuJ8Y$~*YY=%Wv`0lB<7f?fk7If4)GxN$t{=@q)$#taZeosq| zt4XR(g~!Z%MrB3a4EGK%Z_DXu{vxB+U;PgB3<6NI0%gQNRT%2g7d{!X7F(KNWwVWK zo!OA0aPR9Go6-Wg$b!eooKhq) zI64?->&zlc5!#||=z)R-(9G9pfy}3gS)GI!?k_YbBfo8~Vy=nty+680TRz!hhkz44 zQSFqZ4T6mnqN?rZD;lPlMmtOIJd>{AA?~2hlZGrpz--w`|5K6WII!B|s3lv{UG!SS zLF1>*H`M~mxdQ*pH}{732biS?C+xl1*@tLn%BRe~exYGYrEL;vS5aC)DhZIgqElqc z*5OK!f|(sbO8X)1r*8f_vzD}k7^Em09?dylQDKv4Btz3$V{ir|=6eR@!kETUaE+by z>?EITb+nY%eV|!xa~V}~hcp>uXRxWR{%w>N8ZqI5ORi`e&_?XVUt$Ie>c|TgL1mtb z79i=U8+`*MxgqzaJgak67Ezq1FC}o~RB3Ow1t};YdOw-{rQ}u&fBtDS0nTTzqq<#D zAyMwTUwO+%Q~r>fWx-^lCM2-OZ)jz~Ad!rz19@?CAzCVf^Spe}vn~!bE%V{q6!HQy z{uzh~_x@TN?=5P9Z$i$z_r6!ZXEE%Y6I8M;Io@i9)b31l7%XFib(?ZPi!rqeqV8;5 z5;LqyY}U`vZ%>ezJ;%7A;)U}0Vdt^f_^(#qO8aZ~U7DBYkc#BjbD~#UTXNnj%X@aX6;<; z(AFqu(W~B%aqh3%OrJi8`QoqKp<^rE?yycWTR#c)2lrAb%*m8B_&|i$t|=xIzd$u* zgp>BsXjdTn>J*{e?u5RG8E&+9!OU{f-gLwd??=nFuWs$i+PMZ)YZyonE%$h4zoR-| zRoHcz+wR#gM;UpJB_-cVN!H|xj~xq@^?;4~J?YRmOG zv=BSFAbfX?Q9JgTG%E0*^vo%$tJ_s@%W6_9VPI@1*?av{X&>pR#rgBST#{E50^ua* zA(kuR6YU5&UfwM5;U3v?-dVC#pG+<9xxdr%I8(vM+}NdcTdf&J1K$h@SFX)`g$Zf4 zh&^<+wbqOjLdy5|4fRF+cRue$;0rs{v-b54-tIqm-%+mDmE=gj7nY8+m=Enedt9*K zwrR(x7PqN`?@5~SYvY@9CPu7p>QH07hpov?vmK(0e;+1h?z)kRN8hNWR$3+P-~EW_ z>{|E?OE7Ahrp@I0{f{aLqUyYKn<1Dc!fe{7U}pYjFiWHXM3OsD0Dir`B}G(;mk|o% z(iFI0u@2BJ(qXp#&;ZEyA$=Bb+-qx@esfDJ*8`5Gj94Uy%-!^WAKFTX^T6N^)*`yR zJZ1s(!PPC5+@%$XeZVEeT8h}~84eTHBEVj^A6!}!z}?GhS7?9WmjC6|#tLvt@I3^V Ypy%Gr0K9yLTmS$7 literal 0 HcmV?d00001 diff --git a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp new file mode 100644 index 0000000000..f1b374adbf --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp @@ -0,0 +1,264 @@ +#include "ck_tile/host.hpp" +#include "moe_smoothquant.hpp" +#include +#include + +// different threshold 
for different dtype +template +auto get_elimit() +{ + double rtol = 1e-5; + double atol = 1e-5; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-5; + double atol = 1e-5; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + // due to rounding, int8 quantization might have 1 abs error + double rtol = 1; + double atol = 1; + return ck_tile::make_tuple(rtol, atol); +} + +template +void topid_unique_gen( + std::vector& host_tensor, int tokens, int topk, int num_expert, int seed) +{ + size_t total_size = topk * tokens; + std::srand(seed); + std::set unique_set; + IndexType current_v; + for(size_t i = 0; i < total_size; i++) + { + if(i % topk == 0) + { + unique_set.clear(); + } + current_v = std::rand() % num_expert; + while(unique_set.find(current_v) != unique_set.end()) + { + current_v = std::rand() % num_expert; + } + unique_set.insert(current_v); + host_tensor[i] = current_v; + } +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("t", "3328", "tokens dimension") + .insert("h", "4096", "hidden_size dimension") + .insert("e", "32", "experts") + .insert("k", "5", "topk") + .insert("stride", "-1", "stride per row, if -1 then equal to hidden_size") + .insert("v", "1", "cpu validation or not") + .insert("kname", "1", "print kernel name or not") + .insert("prec", "fp16", "precision") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t tokens = arg_parser.get_int("t"); + ck_tile::index_t hidden_size = arg_parser.get_int("h"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + if(stride < 0) + stride = hidden_size; + ck_tile::index_t experts = arg_parser.get_int("e"); + ck_tile::index_t topk = arg_parser.get_int("k"); + std::string 
data_type = arg_parser.get_str("prec"); + int kname = arg_parser.get_int("kname"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + assert(stride >= hidden_size); + + using TypeConfig = MoeSmoothquantTypeConfig; + + using XDataType = typename TypeConfig::XDataType; + using XScaleDataType = typename TypeConfig::XScaleDataType; + using YScaleDataType = typename TypeConfig::YScaleDataType; + using QYDataType = typename TypeConfig::QYDataType; + using ComputeDataType = typename TypeConfig::ComputeDataType; + + // host verify + ck_tile::HostTensor x_host({tokens, hidden_size}, {stride, 1}); + ck_tile::HostTensor xscale_host({experts * hidden_size}); + ck_tile::HostTensor topk_ids_host({tokens, topk}); + + ck_tile::HostTensor yscale_host_ref({topk * tokens}, {1}); + ck_tile::HostTensor yscale_host_dev({topk * tokens}, {1}); + + ck_tile::HostTensor qy_host_ref({topk * tokens, hidden_size}, {stride, 1}); + ck_tile::HostTensor qy_host_dev({topk * tokens, hidden_size}, {stride, 1}); + + topid_unique_gen(topk_ids_host.mData, tokens, topk, experts, 11937); + ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillUniformDistribution{1e-3, .5f}(xscale_host); + + ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem xscale_buf(xscale_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem topk_ids_buf(topk_ids_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes()); + + x_buf.ToDevice(x_host.data()); + xscale_buf.ToDevice(xscale_host.data()); + topk_ids_buf.ToDevice(topk_ids_host.data()); + + std::cout << "[" << data_type << "]" + << " tokens:" << tokens << ", hidden_size:" << hidden_size << ", stride:" << stride + << ", experts:" << experts << ", topk:" << topk << std::flush; + + 
moe_smoothquant_traits traits{data_type}; + + moe_smoothquant_args args{x_buf.GetDeviceBuffer(), + xscale_buf.GetDeviceBuffer(), + topk_ids_buf.GetDeviceBuffer(), + yscale_buf.GetDeviceBuffer(), + qy_buf.GetDeviceBuffer(), + tokens, + hidden_size, + experts, + topk, + stride, + stride}; + + float ave_time = moe_smoothquant( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); + + std::size_t num_byte = + sizeof(XDataType) * tokens * hidden_size + sizeof(XScaleDataType) * topk * hidden_size + + sizeof(YScaleDataType) * topk * tokens + sizeof(QYDataType) * topk * tokens * hidden_size; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush; + + bool pass = true; + + if(do_validation) + { + using YDataType = ComputeDataType; + ck_tile::HostTensor y_host({topk * tokens, hidden_size}, {stride, 1}); + // smooth outlier + { + auto f = [&](auto i_token) { + for(int i_topk = 0; i_topk < topk; i_topk++) + { + auto i_expert = topk_ids_host(i_token, i_topk); + + for(int i_h = 0; i_h < hidden_size; ++i_h) + { + auto v_xscale = ck_tile::type_convert( + xscale_host(i_expert * hidden_size + i_h)); + auto v_x = ck_tile::type_convert(x_host(i_token, i_h)); + // y_host(i_token * topk + i_topk, i_h) = v_x * v_xscale; + y_host(i_topk * tokens + i_token, i_h) = v_x * v_xscale; + } + } + }; + + ck_tile::make_ParallelTensorFunctor(f, tokens)(std::thread::hardware_concurrency()); + } + + // yscale + { + ck_tile::HostTensor y_rowwise_amax_host({topk * tokens}); + + using ReduceAmax = ck_tile::ReduceOp::AbsMax; + ck_tile::reference_reduce( + y_host, y_rowwise_amax_host, ReduceAmax{}); + + auto op = [](const auto& v0) { + return v0 / + ck_tile::type_convert(ck_tile::numeric::max()); + }; + ck_tile::reference_unary_elementwise( + y_rowwise_amax_host, yscale_host_ref, op); + + yscale_buf.FromDevice(yscale_host_dev.mData.data()); + + auto [rtol, atol] = get_elimit(); + pass &= 
ck_tile::check_err(yscale_host_dev, + yscale_host_ref, + std::string("yscale Error: Incorrect results!"), + rtol, + atol); + } + + // rowwise quantization + { + ck_tile::reference_rowwise_quantization2d( + y_host, yscale_host_ref, qy_host_ref); + + qy_buf.FromDevice(qy_host_dev.data()); + auto [rtol, atol] = get_elimit(); + + if(stride == hidden_size) + { + pass = ck_tile::check_err(qy_host_dev, + qy_host_ref, + std::string("qy Error: Incorrect results!"), + rtol, + atol); + } + else + { + for(int i_r = 0; i_r < topk * tokens; i_r++) + { + std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * stride, + qy_host_dev.begin() + i_r * stride + + hidden_size); + std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * stride, + qy_host_ref.begin() + i_r * stride + + hidden_size); + pass &= ck_tile::check_err(qy_host_dev_row, + qy_host_ref_row, + std::string("qy[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + } + + std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + if(data_type == "fp16") + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "bf16") + { + return run(arg_parser) ? 0 : -2; + } + + return -3; +} diff --git a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp new file mode 100644 index 0000000000..9f9adda90f --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/smoothquant.hpp" +#include + +template +struct MoeSmoothquantTypeConfig; + +template <> +struct MoeSmoothquantTypeConfig +{ + using XDataType = ck_tile::half_t; + using XScaleDataType = float; + using YScaleDataType = float; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; +}; + +template <> +struct MoeSmoothquantTypeConfig +{ + using XDataType = ck_tile::bf16_t; + using XScaleDataType = float; + using YScaleDataType = float; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; +}; + +// runtime args +struct moe_smoothquant_args : public ck_tile::MoeSmoothquantHostArgs +{ +}; + +// this is used to pattern-match internl kernel implementation, not to instantiate kernel +template +struct moe_smoothquant_traits_ +{ + using DataType = ck_tile::remove_cvref_t; + + static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize; + static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0); + static constexpr ck_tile::index_t total_warps = + (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize; + + // num of warps along m + static constexpr ck_tile::index_t BlockWarps_M = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return total_warps * (warpSize / ThreadPerBlock_N_); + } + else + { + // static_assert(warpSize % ThreadPerBlock_M_ == 0); + return total_warps / (ThreadPerBlock_N_ / warpSize); + } + }(); + + // num of warps along n + static constexpr ck_tile::index_t BlockWarps_N = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return 1; + } + else + { + static_assert(ThreadPerBlock_N_ % warpSize == 0); + return ThreadPerBlock_N_ / warpSize; + } + }(); + + static constexpr ck_tile::index_t Repeat_M = Repeat_M_; + static constexpr ck_tile::index_t Repeat_N = Repeat_N_; + + static constexpr ck_tile::index_t 
Block_M = Repeat_M_ * ThreadPerBlock_M_; + static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_; + + static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M; + static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_; + + using BlockTile = ck_tile::sequence; + using BlockWarps = ck_tile::sequence; + using WarpTile = ck_tile::sequence; + using Vector = ck_tile::sequence<1, Vector_N_>; + + using Shape = ck_tile::Generic2dBlockShape; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kTwoPass = kTwoPass_; +}; + +template +float moe_smoothquant_(const ck_tile::stream_config& s, moe_smoothquant_args a); + +// This is the public API, will be generated by script +struct moe_smoothquant_traits +{ + std::string data_type; +}; + +float moe_smoothquant(moe_smoothquant_traits, moe_smoothquant_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/14_moe_smoothquant/script/perf_test.sh b/example/ck_tile/14_moe_smoothquant/script/perf_test.sh new file mode 100755 index 0000000000..d1e848b930 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/script/perf_test.sh @@ -0,0 +1,37 @@ + +EXE=build/bin/tile_example_moe_smoothquant + +$EXE -t=1 -h=1 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=80 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=128 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=144 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=168 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=184 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=256 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=288 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=344 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=376 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=448 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=512 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=924 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=1024 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=1078 -v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=1996 
-v=1 -prec=bf16 -repeat=1000 +$EXE -t=700 -h=4080 -v=1 -prec=bf16 -repeat=1000 + +$EXE -t=700 -h=80 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=128 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=144 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=168 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=184 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=256 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=288 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=344 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=376 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=448 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=512 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=924 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=1024 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=1078 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=1996 -v=1 -prec=fp16 -repeat=1000 +$EXE -t=700 -h=4080 -v=1 -prec=fp16 -repeat=1000 \ No newline at end of file diff --git a/example/ck_tile/14_moe_smoothquant/script/smoke_test.sh b/example/ck_tile/14_moe_smoothquant/script/smoke_test.sh new file mode 100755 index 0000000000..3bb62d37b9 --- /dev/null +++ b/example/ck_tile/14_moe_smoothquant/script/smoke_test.sh @@ -0,0 +1,30 @@ +#!/bin/sh +EXE=build/bin/tile_example_moe_smoothquant + +for pr_i in "fp16" "bf16" ; do +$EXE -prec=$pr_i -t=99 -h=13 +$EXE -prec=$pr_i -t=17 -h=16 +$EXE -prec=$pr_i -t=1 -h=100 +$EXE -prec=$pr_i -t=4 -h=128 +$EXE -prec=$pr_i -t=80 -h=127 +$EXE -prec=$pr_i -t=22 -h=255 -stride=256 +$EXE -prec=$pr_i -t=7 -h=599 +$EXE -prec=$pr_i -t=19 -h=512 +$EXE -prec=$pr_i -t=33 -h=313 -stride=1000 +$EXE -prec=$pr_i -t=11 -h=510 +$EXE -prec=$pr_i -t=171 -h=676 -stride=818 +$EXE -prec=$pr_i -t=91 -h=636 +$EXE -prec=$pr_i -t=12 -h=768 -stride=800 +$EXE -prec=$pr_i -t=100 -h=766 -stride=812 +$EXE -prec=$pr_i -t=31 -h=1024 +$EXE -prec=$pr_i -t=64 -h=1000 -stride=1004 +$EXE -prec=$pr_i -t=8 -h=1501 +$EXE -prec=$pr_i -t=3 -h=1826 +$EXE -prec=$pr_i -t=5 -h=2040 +$EXE -prec=$pr_i -t=7 -h=2734 +$EXE -prec=$pr_i -t=1 -h=3182 +$EXE -prec=$pr_i 
-t=9 -h=4096 +$EXE -prec=$pr_i -t=3 -h=8192 +$EXE -prec=$pr_i -t=1 -h=10547 +$EXE -prec=$pr_i -t=3 -h=17134 +done diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 15db0f46c4..b6a44f76b7 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -13,3 +13,4 @@ add_subdirectory(10_rmsnorm2d) add_subdirectory(11_add_rmsnorm2d_rdquant) add_subdirectory(12_smoothquant) add_subdirectory(13_moe_sorting) +add_subdirectory(14_moe_smoothquant) diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index 604c9551ff..a15d2c0402 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -64,6 +64,7 @@ #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE_WITH_NAN 1 #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE 2 #define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD_ASM 3 +#define CK_TILE_FLOAT_TO_BFLOAT16_RTA_ASM 4 #ifndef CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT #define CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE @@ -225,3 +226,7 @@ #ifndef CK_TILE_WORKAROUND_SWDEV_383542 #define CK_TILE_WORKAROUND_SWDEV_383542 1 #endif + +#ifndef CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID +#define CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID 1 +#endif diff --git a/include/ck_tile/core/numeric/bfloat16.hpp b/include/ck_tile/core/numeric/bfloat16.hpp index 5f4b64466e..499ba80a88 100644 --- a/include/ck_tile/core/numeric/bfloat16.hpp +++ b/include/ck_tile/core/numeric/bfloat16.hpp @@ -18,6 +18,7 @@ enum class bf16_rounding_mode truncate_with_nan, truncate, standard_asm, + rta_asm, // round to nearest away }; template (((token_id_)&0x00ffffff) | (((topk_id_)&0xff) << 24)) + template CK_TILE_HOST void reference_moe_sorting(const HostTensor& topk_ids, const HostTensor& weights, @@ -20,8 +23,14 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor& topk_ids, { const index_t num_token = topk_ids.mDesc.get_lengths()[0]; const index_t topk = topk_ids.mDesc.get_lengths()[1]; - std::vector> 
expert_tokens(experts, - std::vector(unit_size, num_token)); + // allocate a temp buffer, and fill the value with [number_token|topk] + std::vector> expert_tokens( + experts, +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + std::vector(unit_size, MOE_SORTING_MOCK_ID(num_token, topk))); +#else + std::vector(unit_size, num_token)); +#endif std::vector> expert_token_weights( experts, std::vector(unit_size, 0)); std::vector expert_slices(experts, 1); @@ -42,12 +51,19 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor& topk_ids, expert_token_weights[e].resize(new_size); for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++) { - expert_tokens[e][i] = num_token; +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + expert_tokens[e][i] = MOE_SORTING_MOCK_ID(num_token, topk); +#else + expert_tokens[e][i] = num_token; +#endif expert_token_weights[e][i] = 0; } } - - expert_tokens[e][idx] = t; +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + expert_tokens[e][idx] = MOE_SORTING_MOCK_ID(t, k); +#else + expert_tokens[e][idx] = t; +#endif expert_token_weights[e][idx] = w; expert_slice_idxs[e]++; } @@ -75,4 +91,7 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor& topk_ids, unit_cnt *= unit_size; return; } + +#undef MOE_SORTING_MOCK_ID + } // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp index 1c6acec70e..d9e28ceb52 100644 --- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp @@ -12,20 +12,77 @@ namespace ck_tile { +#define MOE_SORTING_MOCK_ID(token_id_, topk_id_) \ + static_cast(((token_id_)&0x00ffffff) | (((topk_id_)&0xff) << 24)) + +// clang-format off +// [indexing implementation-1] +// using M_a as constexpr block_size to partition all tokens into different slices +// each slice map to one expert, and one expert can have multiple slices +// e.g. 
 num_experts = 6, topk=3, M_a = 4, input_tokens = 5 +// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]] +// tok-0 tok-1 tok-2 tok-3 tok-4 +// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number) +// +// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 4]] +// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5 +// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]] +// +// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1) +// * this could be larger than actual, since actual tokens are on GPU +// +// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 0, 1, 2, 4] +// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4 -|- exp-5 -| +// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, c, f, i, o] +// +// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr +// +// * Note on token_id_per_expert/sorted_token_ids_ptr data: +// currently we do not have topk information from the data of token_id_per_expert/sorted_token_ids_ptr. +// In some cases(like smooth-quant), we need topk information to index into tokens quant from +// different expert smooth quant. 
So we modify the number stored inside token_id_per_expert/sorted_token_ids_ptr +// +// 32bit 0........23 24.....31 bit +// (data) -> (token_id | topk_id) +// low 24 bit is for token id, top 8 bit is for topk id +// +// the input after smooth-quant is [topk, token, hidden_dim], originally it is [token, hidden_dim] +// the input scale for token is [topk, token, 1], the smooth-quant scale for first gemm is [expert, interm_dim] +// +// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5] +// * length is (max_num_tokens_padded + block_size - 1) / block_size +// +// num_tokens_post_padded_ptr : [28] +// num_sorted_tiles_ptr : [7] +// +// * different from vLLM +// 1) token_id stored in sorted_token_ids_ptr is actual token_id, not token_id*top_K expanded id +// 2)need sorted_weight_ptr +// 3) use num_sorted_tiles_ptr, already divided by M_a +// +// * below used for indexing +// 1) sorted_token_ids_ptr [max_num_tokens_padded] +// 2) sorted_weight_ptr +// 3) sorted_expert_ids_ptr +// 4)num_tokens_post_padded_ptr/num_sorted_tiles_ptr (select one) +// +// max_num_tokens_padded: opk_ids.numel() + num_experts * (block_size - 1) struct MoeSortingHostArgs { - const void* p_topk_ids; - const void* p_weights; + const void* p_topk_ids; // [token, topk] + const void* p_weights; // [token, topk] void* p_sorted_token_ids; void* p_sorted_weights; void* p_sorted_expert_ids; void* p_total_tokens_post_pad; + // we fused the setzero of output of fused-moe buffer + // set this pointer to nullptr will skip this operation void* p_moe_buf; index_t tokens; - index_t unit_size; + index_t unit_size; // this is the M_a of fused-moe kernel index_t num_experts; index_t topk; - index_t moe_buf_bytes; + index_t moe_buf_bytes; // byte size of p_moe_buf }; template @@ -183,8 +240,14 @@ struct MoeSortingKernel index_t expert_id = topk_id[i]; index_t rank_post_pad = tokens_cnts[calc_index(num_experts, tid, expert_id)] + cumsum[expert_id]; +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + uint32_t curr_token_id, 
curr_topk_id; + topk_mdiv.divmod(i, curr_token_id, curr_topk_id); + p_sorted_token_ids[rank_post_pad] = MOE_SORTING_MOCK_ID(curr_token_id, curr_topk_id); +#else p_sorted_token_ids[rank_post_pad] = topk_mdiv.div(i); - p_sorted_weights[rank_post_pad] = weights[i]; +#endif + p_sorted_weights[rank_post_pad] = weights[i]; ++tokens_cnts[calc_index(num_experts, tid, expert_id)]; } @@ -195,8 +258,13 @@ struct MoeSortingKernel cumsum[tid] + tokens_cnts[calc_index(num_experts, blockDim.x, tid)]; while(expert_offset < cumsum[tid + 1]) { +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + p_sorted_token_ids[expert_offset] = + MOE_SORTING_MOCK_ID(prefill_token, topk_mdiv.divisor); +#else p_sorted_token_ids[expert_offset] = prefill_token; - p_sorted_weights[expert_offset] = static_cast(0.0); +#endif + p_sorted_weights[expert_offset] = static_cast(0.0); expert_offset++; } } @@ -229,4 +297,7 @@ struct MoeSortingKernel smem); } }; + +#undef MOE_SORTING_MOCK_ID + } // namespace ck_tile diff --git a/include/ck_tile/ops/smoothquant.hpp b/include/ck_tile/ops/smoothquant.hpp index c9e4597657..24a59b45b0 100644 --- a/include/ck_tile/ops/smoothquant.hpp +++ b/include/ck_tile/ops/smoothquant.hpp @@ -3,6 +3,7 @@ #pragma once +#include "ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp" #include "ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp" #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp" #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp" diff --git a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp new file mode 100644 index 0000000000..1bece521f5 --- /dev/null +++ b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" + +namespace ck_tile { + +// host side args +struct MoeSmoothquantHostArgs +{ + const void* p_x; // [tokens ,hidden_size], input, fp16/bf16 + const void* p_xscale; // [experts, hidden_size], input, columnwise scale, fp32 + const void* p_topk_ids; // [tokens, topk] + + void* p_yscale; // [topk * tokens, 1], output, rowwise quant scale + void* p_qy; // [topk * tokens, hidden_size], output + + index_t tokens; + index_t hidden_size; + index_t experts; + index_t topk; + index_t x_stride; // input x row stride + index_t y_stride; // output y stride(stride for topk) +}; + +// TODO: Extract some type to wrapper class +template +struct MoeSmoothquant +{ + using Pipeline = remove_cvref_t; + using Problem = typename Pipeline::Problem; + + using XDataType = remove_cvref_t; + using XScaleDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using YScaleDataType = remove_cvref_t; + using QYDataType = remove_cvref_t; + + static constexpr index_t Block_M = Problem::BlockShape::Block_M; + static constexpr index_t Block_N = Problem::BlockShape::Block_N; + static constexpr bool kPadM = false; // always no need to pad along M + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kTwoPass = Problem::kTwoPass; + + static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N; + static constexpr index_t Vector_N = Problem::BlockShape::Vector_N; + static constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N; + + static constexpr auto I0 = number<0>{}; + static constexpr auto I1 = number<1>{}; + + static_assert(Problem::BlockShape::Repeat_M == 1); + + struct Kargs + { + const void* p_x; // [tokens ,hidden_size], input, fp16/bf16 + const void* p_xscale; // [experts, hidden_size], input, columnwise scale, fp32 + const void* p_topk_ids; // [tokens, topk] + + void* p_yscale; // [topk, tokens, 1], output, rowwise quant scale + void* p_qy; // [topk, tokens, 
hidden_size], output + + index_t tokens; + index_t hidden_size; + index_t experts; + index_t topk; + index_t x_stride; // input x row stride + index_t y_stride; // output y stride(stride for topk) + }; + using Hargs = MoeSmoothquantHostArgs; + + CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs) + { + return Kargs{hargs.p_x, + hargs.p_xscale, + hargs.p_topk_ids, + hargs.p_yscale, + hargs.p_qy, + hargs.tokens, + hargs.hidden_size, + hargs.experts, + hargs.topk, + hargs.x_stride, + hargs.y_stride}; + } + + CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) + { + return dim3(hargs.topk, integer_divide_ceil(hargs.tokens, Block_M), 1); + } + + CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; } + + // clang-format off + template struct t2s; + template <> struct t2s { static constexpr const char * name = "fp32"; }; + template <> struct t2s { static constexpr const char * name = "fp16"; }; + template <> struct t2s { static constexpr const char * name = "bf16"; }; + template <> struct t2s { static constexpr const char * name = "fp8"; }; + template <> struct t2s { static constexpr const char * name = "bf8"; }; + // clang-format on + + // in byte + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); } + + CK_TILE_HOST static std::string GetName() + { + // clang-format off + using S_ = typename Problem::BlockShape; + auto surfix = [&] () { + std::string n; + if (kPadN) n += "_pn"; + if (kTwoPass) n += "_2p"; + return n; }(); + + #define _SS_ std::string + #define _TS_ std::to_string + return _SS_("moe_smoothquant_") + _SS_(t2s::name) + "_" + + _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" + + _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" + + _SS_(Pipeline::name) + surfix; + #undef _SS_ + #undef _TS_ + // clang-format on + } + + CK_TILE_DEVICE void 
operator()(Kargs kargs) const + { + const index_t i_topk = blockIdx.x; + const index_t i_token = blockIdx.y * Block_M; + const index_t i_token_in_thrd = + __builtin_amdgcn_readfirstlane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N); + + const index_t i_expert = reinterpret_cast( + kargs.p_topk_ids)[(i_token + i_token_in_thrd) * kargs.topk + i_topk]; + + // [tokens ,hidden_size] + const auto x_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_x), + make_tuple(kargs.tokens, kargs.hidden_size), + make_tuple(kargs.x_stride, 1), + number{}, + number<1>{}); + + const auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {i_token, 0}); + }(); + + // [experts, hidden_size], + const auto xscale_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_xscale) + i_expert * kargs.hidden_size, + make_tuple(kargs.hidden_size), + make_tuple(1), + number{}, + number<1>{}); + + const auto tmp2_ = + pad_tensor_view(tmp_, make_tuple(number{}), sequence{}); + + return make_tile_window(tmp2_, make_tuple(number{}), {0}); + }(); + + // [topk, tokens] + auto yscale_window = [&]() { + const auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_yscale) + i_topk * kargs.tokens, + make_tuple(kargs.tokens), + make_tuple(1), + number<1>{}); + + const auto tmp2_ = + pad_tensor_view(tmp_, make_tuple(number{}), sequence{}); + + return make_tile_window(tmp2_, make_tuple(number{}), {i_token}); + }(); + + // [topk, tokens, hidden_size] + auto qy_window = [&]() { + auto tmp_ = make_naive_tensor_view( + static_cast(kargs.p_qy) + i_topk * kargs.tokens * kargs.y_stride, + make_tuple(kargs.tokens, kargs.hidden_size), + make_tuple(kargs.y_stride, 1), + number{}, + number<1>{}); + + auto tmp2_ = pad_tensor_view( + tmp_, make_tuple(number{}, number{}), sequence{}); + return make_tile_window( + tmp2_, make_tuple(number{}, number{}), {i_token, 
0}); + }(); + + __shared__ char smem[GetSmemSize()]; + + Pipeline{}(x_window, xscale_window, yscale_window, qy_window, kargs.hidden_size, smem); + } +}; + +} // namespace ck_tile From c2bcbb1379c31a068234216a585027a91be57fee Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 24 Nov 2024 21:41:52 -0800 Subject: [PATCH 12/52] Bump rocm-docs-core from 1.8.5 to 1.9.0 in /docs/sphinx (#1691) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.8.5 to 1.9.0. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/v1.9.0/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.8.5...v1.9.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 3a2e266ef5..5bec504a08 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.8.5 +rocm-docs-core==1.9.0 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index b65d2391f6..8881c0e746 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.8.5 +rocm-docs-core==1.9.0 # via -r requirements.in six==1.16.0 # via pybtex From 645fe812f65db86a9eaca7ae00e0004c1634bc0a Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Mon, 25 Nov 2024 15:30:35 +0800 Subject: [PATCH 13/52] [CK_TILE] Fix fMHA fwd MakeKargs() compilation errors (#1689) * Fix mis-matched tuple<> elem types * Rename 
MakeKargs() as MakeKargsImpl() --------- Co-authored-by: Qianfeng --- example/ck_tile/01_fmha/fmha_bwd.hpp | 208 +++++----- example/ck_tile/01_fmha/fmha_fwd.hpp | 156 ++++---- .../ops/fmha/kernel/fmha_bwd_kernel.hpp | 232 +++++------ .../ops/fmha/kernel/fmha_fwd_kernel.hpp | 370 +++++++++--------- 4 files changed, 484 insertions(+), 482 deletions(-) diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp index 3b21a3257f..722ef15a2f 100644 --- a/example/ck_tile/01_fmha/fmha_bwd.hpp +++ b/example/ck_tile/01_fmha/fmha_bwd.hpp @@ -150,113 +150,113 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args) // create group mode kernel arguments if constexpr(FmhaBwdDQDKDVKernel::kIsGroupMode) { - return FmhaBwdDQDKDVKernel::MakeKargs(args.q_ptr, - args.k_ptr, - args.v_ptr, - args.bias_ptr, - args.lse_ptr, - args.do_ptr, - args.d_ptr, - args.rand_val_ptr, - args.dk_ptr, - args.dv_ptr, - args.dbias_ptr, - args.dq_acc_ptr, - args.seqstart_q_ptr, - args.seqstart_k_ptr, - args.seqlen_k_ptr, - args.hdim_q, - args.hdim_v, - args.nhead_q, - args.nhead_q / args.nhead_k, - args.scale, - args.stride_q, - args.stride_k, - args.stride_v, - args.stride_bias, - args.stride_randval, - args.stride_do, - args.stride_dq_acc, - args.stride_dk, - args.stride_dv, - args.stride_dbias, - args.nhead_stride_q, - args.nhead_stride_k, - args.nhead_stride_v, - args.nhead_stride_bias, - args.nhead_stride_randval, - args.nhead_stride_do, - args.nhead_stride_lsed, - args.nhead_stride_dq_acc, - args.nhead_stride_dk, - args.nhead_stride_dv, - args.nhead_stride_dbias, - args.split_stride_dq_acc, - args.window_size_left, - args.window_size_right, - args.mask_type, - args.p_drop, - args.drop_seed_offset); + return FmhaBwdDQDKDVKernel::MakeKargsImpl(args.q_ptr, + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.lse_ptr, + args.do_ptr, + args.d_ptr, + args.rand_val_ptr, + args.dk_ptr, + args.dv_ptr, + args.dbias_ptr, + args.dq_acc_ptr, + args.seqstart_q_ptr, + 
args.seqstart_k_ptr, + args.seqlen_k_ptr, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.scale, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_do, + args.stride_dq_acc, + args.stride_dk, + args.stride_dv, + args.stride_dbias, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_do, + args.nhead_stride_lsed, + args.nhead_stride_dq_acc, + args.nhead_stride_dk, + args.nhead_stride_dv, + args.nhead_stride_dbias, + args.split_stride_dq_acc, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.drop_seed_offset); } else { // create batch mode kernel arguments - return FmhaBwdDQDKDVKernel::MakeKargs(args.q_ptr, - args.k_ptr, - args.v_ptr, - args.bias_ptr, - args.lse_ptr, - args.do_ptr, - args.d_ptr, - args.rand_val_ptr, - args.dk_ptr, - args.dv_ptr, - args.dbias_ptr, - args.dq_acc_ptr, - args.seqlen_q, - args.seqlen_k, - args.hdim_q, - args.hdim_v, - args.nhead_q, - args.nhead_q / args.nhead_k, - args.scale, - args.stride_q, - args.stride_k, - args.stride_v, - args.stride_bias, - args.stride_randval, - args.stride_do, - args.stride_dq_acc, - args.stride_dk, - args.stride_dv, - args.stride_dbias, - args.nhead_stride_q, - args.nhead_stride_k, - args.nhead_stride_v, - args.nhead_stride_bias, - args.nhead_stride_randval, - args.nhead_stride_do, - args.nhead_stride_lsed, - args.nhead_stride_dq_acc, - args.nhead_stride_dk, - args.nhead_stride_dv, - args.nhead_stride_dbias, - args.batch_stride_q, - args.batch_stride_k, - args.batch_stride_v, - args.batch_stride_bias, - args.batch_stride_randval, - args.batch_stride_do, - args.batch_stride_lsed, - args.batch_stride_dq_acc, - args.batch_stride_dk, - args.batch_stride_dv, - args.batch_stride_dbias, - args.split_stride_dq_acc, - args.window_size_left, - args.window_size_right, - args.mask_type, - args.p_drop, - 
args.drop_seed_offset); + return FmhaBwdDQDKDVKernel::MakeKargsImpl(args.q_ptr, + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.lse_ptr, + args.do_ptr, + args.d_ptr, + args.rand_val_ptr, + args.dk_ptr, + args.dv_ptr, + args.dbias_ptr, + args.dq_acc_ptr, + args.seqlen_q, + args.seqlen_k, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.scale, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_do, + args.stride_dq_acc, + args.stride_dk, + args.stride_dv, + args.stride_dbias, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_do, + args.nhead_stride_lsed, + args.nhead_stride_dq_acc, + args.nhead_stride_dk, + args.nhead_stride_dv, + args.nhead_stride_dbias, + args.batch_stride_q, + args.batch_stride_k, + args.batch_stride_v, + args.batch_stride_bias, + args.batch_stride_randval, + args.batch_stride_do, + args.batch_stride_lsed, + args.batch_stride_dq_acc, + args.batch_stride_dk, + args.batch_stride_dv, + args.batch_stride_dbias, + args.split_stride_dq_acc, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.drop_seed_offset); } }(); diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 41edac67ba..704453baa4 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -281,87 +281,87 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args) // create group mode kernel arguments if constexpr(FmhaKernel::kIsGroupMode) { - return FmhaKernel::MakeKargs(args.q_ptr, - args.k_ptr, - args.v_ptr, - args.bias_ptr, - args.rand_val_ptr, - args.lse_ptr, - args.o_ptr, - args.seqstart_q_ptr, - args.seqstart_k_ptr, - args.seqlen_k_ptr, - args.hdim_q, - args.hdim_v, - args.nhead_q, - args.nhead_q / args.nhead_k, - args.scale_s, - args.scale_p, - args.scale_o, - args.stride_q, - 
args.stride_k, - args.stride_v, - args.stride_bias, - args.stride_randval, - args.stride_o, - args.nhead_stride_q, - args.nhead_stride_k, - args.nhead_stride_v, - args.nhead_stride_bias, - args.nhead_stride_randval, - args.nhead_stride_lse, - args.nhead_stride_o, - args.window_size_left, - args.window_size_right, - args.mask_type, - args.p_drop, - args.s_randval, - args.drop_seed_offset); + return FmhaKernel::MakeKargsImpl(args.q_ptr, + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.rand_val_ptr, + args.lse_ptr, + args.o_ptr, + args.seqstart_q_ptr, + args.seqstart_k_ptr, + args.seqlen_k_ptr, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.scale_s, + args.scale_p, + args.scale_o, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_o, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_lse, + args.nhead_stride_o, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.s_randval, + args.drop_seed_offset); } else { // create batch mode kernel arguments - return FmhaKernel::MakeKargs(args.q_ptr, - args.k_ptr, - args.v_ptr, - args.bias_ptr, - args.rand_val_ptr, - args.lse_ptr, - args.o_ptr, - args.seqlen_q, - args.seqlen_k, - args.hdim_q, - args.hdim_v, - args.nhead_q, - args.nhead_q / args.nhead_k, - args.scale_s, - args.scale_p, - args.scale_o, - args.stride_q, - args.stride_k, - args.stride_v, - args.stride_bias, - args.stride_randval, - args.stride_o, - args.nhead_stride_q, - args.nhead_stride_k, - args.nhead_stride_v, - args.nhead_stride_bias, - args.nhead_stride_randval, - args.nhead_stride_lse, - args.nhead_stride_o, - args.batch_stride_q, - args.batch_stride_k, - args.batch_stride_v, - args.batch_stride_bias, - args.batch_stride_randval, - args.batch_stride_lse, - args.batch_stride_o, - args.window_size_left, - args.window_size_right, - 
args.mask_type, - args.p_drop, - args.s_randval, - args.drop_seed_offset); + return FmhaKernel::MakeKargsImpl(args.q_ptr, + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.rand_val_ptr, + args.lse_ptr, + args.o_ptr, + args.seqlen_q, + args.seqlen_k, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.scale_s, + args.scale_p, + args.scale_o, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_o, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_lse, + args.nhead_stride_o, + args.batch_stride_q, + args.batch_stride_k, + args.batch_stride_v, + args.batch_stride_bias, + args.batch_stride_randval, + args.batch_stride_lse, + args.batch_stride_o, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.s_randval, + args.drop_seed_offset); } }(); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp index ccf15ee600..23174528e7 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp @@ -304,64 +304,64 @@ struct FmhaBwdDQDKDVKernel template CK_TILE_HOST static constexpr std::enable_if_t - MakeKargs(const void* q_ptr, - const void* k_ptr, - const void* v_ptr, - const void* bias_ptr, - const void* lse_ptr, - const void* do_ptr, - const void* d_ptr, - void* rand_val_ptr, - void* dk_ptr, - void* dv_ptr, - void* dbias_ptr, - void* dq_acc_ptr, - ck_tile::index_t seqlen_q, - ck_tile::index_t seqlen_k, - ck_tile::index_t hdim_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_head_q, - ck_tile::index_t nhead_ratio_qk, - float scale, - ck_tile::index_t stride_q, - ck_tile::index_t stride_k, - ck_tile::index_t stride_v, - ck_tile::index_t stride_bias, - ck_tile::index_t stride_randval, - ck_tile::index_t stride_do, - ck_tile::index_t 
stride_dq_acc, - ck_tile::index_t stride_dk, - ck_tile::index_t stride_dv, - ck_tile::index_t stride_dbias, - ck_tile::index_t nhead_stride_q, - ck_tile::index_t nhead_stride_k, - ck_tile::index_t nhead_stride_v, - ck_tile::index_t nhead_stride_bias, - ck_tile::index_t nhead_stride_randval, - ck_tile::index_t nhead_stride_do, - ck_tile::index_t nhead_stride_lsed, - ck_tile::index_t nhead_stride_dq_acc, - ck_tile::index_t nhead_stride_dk, - ck_tile::index_t nhead_stride_dv, - ck_tile::index_t nhead_stride_dbias, - ck_tile::index_t batch_stride_q, - ck_tile::index_t batch_stride_k, - ck_tile::index_t batch_stride_v, - ck_tile::index_t batch_stride_bias, - ck_tile::index_t batch_stride_randval, - ck_tile::index_t batch_stride_do, - ck_tile::index_t batch_stride_lsed, - ck_tile::index_t batch_stride_dq_acc, - ck_tile::index_t batch_stride_dk, - ck_tile::index_t batch_stride_dv, - ck_tile::index_t batch_stride_dbias, - ck_tile::index_t split_stride_dq_acc, - ck_tile::index_t window_size_left, - ck_tile::index_t window_size_right, - ck_tile::index_t mask_type, - float p_drop, - std::variant, std::pair> - drop_seed_offset) + MakeKargsImpl(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t 
nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_do, + ck_tile::index_t batch_stride_lsed, + ck_tile::index_t batch_stride_dq_acc, + ck_tile::index_t batch_stride_dk, + ck_tile::index_t batch_stride_dv, + ck_tile::index_t batch_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + std::variant, std::pair> + drop_seed_offset) { Kargs kargs{{q_ptr, k_ptr, @@ -470,7 +470,7 @@ struct FmhaBwdDQDKDVKernel return kargs; } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -531,7 +531,7 @@ struct FmhaBwdDQDKDVKernel float p_drop, const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, @@ -591,7 +591,7 @@ struct FmhaBwdDQDKDVKernel std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -650,9 +650,9 @@ struct FmhaBwdDQDKDVKernel ck_tile::index_t window_size_right, ck_tile::index_t mask_type, float p_drop, - const std::tuple& 
drop_seed_offset) + const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, @@ -714,54 +714,54 @@ struct FmhaBwdDQDKDVKernel template CK_TILE_HOST static constexpr std::enable_if_t - MakeKargs(const void* q_ptr, - const void* k_ptr, - const void* v_ptr, - const void* bias_ptr, - const void* lse_ptr, - const void* do_ptr, - const void* d_ptr, - void* rand_val_ptr, - void* dk_ptr, - void* dv_ptr, - void* dbias_ptr, - void* dq_acc_ptr, - const void* seqstart_q_ptr, - const void* seqstart_k_ptr, - const void* seqlen_k_ptr, - ck_tile::index_t hdim_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_head_q, - ck_tile::index_t nhead_ratio_qk, - float scale, - ck_tile::index_t stride_q, - ck_tile::index_t stride_k, - ck_tile::index_t stride_v, - ck_tile::index_t stride_bias, - ck_tile::index_t stride_randval, - ck_tile::index_t stride_do, - ck_tile::index_t stride_dq_acc, - ck_tile::index_t stride_dk, - ck_tile::index_t stride_dv, - ck_tile::index_t stride_dbias, - ck_tile::index_t nhead_stride_q, - ck_tile::index_t nhead_stride_k, - ck_tile::index_t nhead_stride_v, - ck_tile::index_t nhead_stride_bias, - ck_tile::index_t nhead_stride_randval, - ck_tile::index_t nhead_stride_do, - ck_tile::index_t nhead_stride_lsed, - ck_tile::index_t nhead_stride_dq_acc, - ck_tile::index_t nhead_stride_dk, - ck_tile::index_t nhead_stride_dv, - ck_tile::index_t nhead_stride_dbias, - ck_tile::index_t split_stride_dq_acc, - ck_tile::index_t window_size_left, - ck_tile::index_t window_size_right, - ck_tile::index_t mask_type, - float p_drop, - std::variant, std::pair> - drop_seed_offset) + MakeKargsImpl(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + const void* lse_ptr, + const void* do_ptr, + const void* d_ptr, + void* rand_val_ptr, + void* dk_ptr, + void* dv_ptr, + void* dbias_ptr, + void* dq_acc_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + 
ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_do, + ck_tile::index_t stride_dq_acc, + ck_tile::index_t stride_dk, + ck_tile::index_t stride_dv, + ck_tile::index_t stride_dbias, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_do, + ck_tile::index_t nhead_stride_lsed, + ck_tile::index_t nhead_stride_dq_acc, + ck_tile::index_t nhead_stride_dk, + ck_tile::index_t nhead_stride_dv, + ck_tile::index_t nhead_stride_dbias, + ck_tile::index_t split_stride_dq_acc, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + std::variant, std::pair> + drop_seed_offset) { Kargs kargs{{q_ptr, k_ptr, @@ -858,7 +858,7 @@ struct FmhaBwdDQDKDVKernel return kargs; } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -909,7 +909,7 @@ struct FmhaBwdDQDKDVKernel float p_drop, const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, @@ -959,7 +959,7 @@ struct FmhaBwdDQDKDVKernel std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -1008,9 +1008,9 @@ struct 
FmhaBwdDQDKDVKernel ck_tile::index_t window_size_right, ck_tile::index_t mask_type, float p_drop, - const std::tuple& drop_seed_offset) + const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 4443a45038..3de433d6a7 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -64,7 +64,7 @@ struct FmhaFwdKernel template <> struct t2s { static constexpr const char * name = "bf8"; }; // clang-format on - __host__ static std::string GetName() + CK_TILE_HOST static std::string GetName() { // sync with generate.py // clang-format off @@ -267,50 +267,50 @@ struct FmhaFwdKernel using Kargs = std::conditional_t; template - __host__ static constexpr std::enable_if_t - MakeKargs(const void* q_ptr, - const void* k_ptr, - const void* v_ptr, - const void* bias_ptr, - void* rand_val_ptr, - void* lse_ptr, - void* o_ptr, - ck_tile::index_t seqlen_q, - ck_tile::index_t seqlen_k, - ck_tile::index_t hdim_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_head_q, - ck_tile::index_t nhead_ratio_qk, - float scale_s, - float scale_p, - float scale_o, - ck_tile::index_t stride_q, - ck_tile::index_t stride_k, - ck_tile::index_t stride_v, - ck_tile::index_t stride_bias, - ck_tile::index_t stride_randval, - ck_tile::index_t stride_o, - ck_tile::index_t nhead_stride_q, - ck_tile::index_t nhead_stride_k, - ck_tile::index_t nhead_stride_v, - ck_tile::index_t nhead_stride_bias, - ck_tile::index_t nhead_stride_randval, - ck_tile::index_t nhead_stride_lse, - ck_tile::index_t nhead_stride_o, - ck_tile::index_t batch_stride_q, - ck_tile::index_t batch_stride_k, - ck_tile::index_t batch_stride_v, - ck_tile::index_t batch_stride_bias, - ck_tile::index_t batch_stride_randval, - ck_tile::index_t batch_stride_lse, - ck_tile::index_t batch_stride_o, - ck_tile::index_t 
window_size_left, - ck_tile::index_t window_size_right, - ck_tile::index_t mask_type, - float p_drop, - bool s_randval, - std::variant, std::pair> - drop_seed_offset) + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargsImpl(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t seqlen_k, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_lse, + ck_tile::index_t batch_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + std::variant, std::pair> + drop_seed_offset) { Kargs kargs{{q_ptr, k_ptr, @@ -399,9 +399,9 @@ struct FmhaFwdKernel return kargs; } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template - __host__ static constexpr std::enable_if_t + CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, const void* k_ptr, const void* v_ptr, @@ -445,53 +445,54 @@ struct FmhaFwdKernel bool s_randval, const std::tuple& 
drop_seed_offset) { - MakeKargs(q_ptr, - k_ptr, - v_ptr, - bias_ptr, - rand_val_ptr, - lse_ptr, - o_ptr, - seqlen_q, - seqlen_k, - hdim_q, - hdim_v, - num_head_q, - nhead_ratio_qk, - scale_s, - scale_p, - scale_o, - stride_q, - stride_k, - stride_v, - stride_bias, - stride_randval, - stride_o, - nhead_stride_q, - nhead_stride_k, - nhead_stride_v, - nhead_stride_bias, - nhead_stride_randval, - nhead_stride_lse, - nhead_stride_o, - batch_stride_q, - batch_stride_k, - batch_stride_v, - batch_stride_bias, - batch_stride_randval, - batch_stride_lse, - batch_stride_o, - window_size_left, - window_size_right, - mask_type, - p_drop, - s_randval, - std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + return MakeKargsImpl( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_lse, + batch_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template - __host__ static constexpr std::enable_if_t + CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, const void* k_ptr, const void* v_ptr, @@ -533,91 +534,92 @@ struct FmhaFwdKernel ck_tile::index_t mask_type, float p_drop, bool s_randval, - const std::tuple& drop_seed_offset) + const std::tuple& drop_seed_offset) { - MakeKargs(q_ptr, - 
k_ptr, - v_ptr, - bias_ptr, - rand_val_ptr, - lse_ptr, - o_ptr, - seqlen_q, - seqlen_k, - hdim_q, - hdim_v, - num_head_q, - nhead_ratio_qk, - scale_s, - scale_p, - scale_o, - stride_q, - stride_k, - stride_v, - stride_bias, - stride_randval, - stride_o, - nhead_stride_q, - nhead_stride_k, - nhead_stride_v, - nhead_stride_bias, - nhead_stride_randval, - nhead_stride_lse, - nhead_stride_o, - batch_stride_q, - batch_stride_k, - batch_stride_v, - batch_stride_bias, - batch_stride_randval, - batch_stride_lse, - batch_stride_o, - window_size_left, - window_size_right, - mask_type, - p_drop, - s_randval, - std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); + return MakeKargsImpl( + q_ptr, + k_ptr, + v_ptr, + bias_ptr, + rand_val_ptr, + lse_ptr, + o_ptr, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + scale_s, + scale_p, + scale_o, + stride_q, + stride_k, + stride_v, + stride_bias, + stride_randval, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_bias, + nhead_stride_randval, + nhead_stride_lse, + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_bias, + batch_stride_randval, + batch_stride_lse, + batch_stride_o, + window_size_left, + window_size_right, + mask_type, + p_drop, + s_randval, + std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } template - __host__ static constexpr std::enable_if_t - MakeKargs(const void* q_ptr, - const void* k_ptr, - const void* v_ptr, - const void* bias_ptr, - void* rand_val_ptr, - void* lse_ptr, - void* o_ptr, - const void* seqstart_q_ptr, - const void* seqstart_k_ptr, - const void* seqlen_k_ptr, - ck_tile::index_t hdim_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_head_q, - ck_tile::index_t nhead_ratio_qk, - float scale_s, - float scale_p, - float scale_o, - ck_tile::index_t stride_q, - ck_tile::index_t stride_k, - ck_tile::index_t stride_v, - ck_tile::index_t stride_bias, - 
ck_tile::index_t stride_randval, - ck_tile::index_t stride_o, - ck_tile::index_t nhead_stride_q, - ck_tile::index_t nhead_stride_k, - ck_tile::index_t nhead_stride_v, - ck_tile::index_t nhead_stride_bias, - ck_tile::index_t nhead_stride_randval, - ck_tile::index_t nhead_stride_lse, - ck_tile::index_t nhead_stride_o, - ck_tile::index_t window_size_left, - ck_tile::index_t window_size_right, - ck_tile::index_t mask_type, - float p_drop, - bool s_randval, - std::variant, std::pair> - drop_seed_offset) + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargsImpl(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + const void* seqstart_q_ptr, + const void* seqstart_k_ptr, + const void* seqlen_k_ptr, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + float scale_s, + float scale_p, + float scale_o, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + std::variant, std::pair> + drop_seed_offset) { Kargs kargs{{q_ptr, k_ptr, @@ -702,9 +704,9 @@ struct FmhaFwdKernel return kargs; } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template - __host__ static constexpr std::enable_if_t + CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, const void* 
k_ptr, const void* v_ptr, @@ -742,7 +744,7 @@ struct FmhaFwdKernel bool s_randval, const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, @@ -781,9 +783,9 @@ struct FmhaFwdKernel std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } - // std::variant can't take in a list initializer, overload for backward compatibility + // std::variant<> can't take in a list initializer, overload for backward compatibility template - __host__ static constexpr std::enable_if_t + CK_TILE_HOST static constexpr std::enable_if_t MakeKargs(const void* q_ptr, const void* k_ptr, const void* v_ptr, @@ -819,9 +821,9 @@ struct FmhaFwdKernel ck_tile::index_t mask_type, float p_drop, bool s_randval, - const std::tuple& drop_seed_offset) + const std::tuple& drop_seed_offset) { - return MakeKargs( + return MakeKargsImpl( q_ptr, k_ptr, v_ptr, @@ -860,15 +862,15 @@ struct FmhaFwdKernel std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset))); } - __host__ static constexpr auto GridSize(ck_tile::index_t batch_size_, - ck_tile::index_t nhead_, - ck_tile::index_t seqlen_q_, - ck_tile::index_t hdim_v_) + CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_, + ck_tile::index_t nhead_, + ck_tile::index_t seqlen_q_, + ck_tile::index_t hdim_v_) { return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_); } - __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); } + CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); } CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() { From 440e28b08fa0f503c229f5787be4f775ad20484c Mon Sep 17 00:00:00 2001 From: carlushuang Date: Tue, 26 Nov 2024 11:14:56 +0800 Subject: [PATCH 14/52] [CK_TILE] fused-moe first version (#1634) * moe pipeline * update code * compile OK * update * update cpu reference * update pipeline_gemm0 * compiler ok * update pipeline * rename to ex pipeline * block-asm 
* update * update * update first gemm ok * compute correct * update file structure * update README * update * update * update code * update API * return unsupport case * add comment * update readme * update * uncomment * update * fix build err --------- Co-authored-by: valarLip <340077269@qq.com> --- .../alternative_impl/matrix_core_swizzle.cpp | 4 +- .../matrix_core_swizzle_kernel.hpp | 12 +- example/ck_tile/06_permute/permute.cpp | 2 +- .../13_moe_sorting/moe_sorting_api.hpp | 2 +- example/ck_tile/15_fused_moe/CMakeLists.txt | 19 + example/ck_tile/15_fused_moe/README.md | 69 ++ example/ck_tile/15_fused_moe/fused_moe.hpp | 52 ++ .../ck_tile/15_fused_moe/fused_moegemm.hpp | 84 ++ .../ck_tile/15_fused_moe/fused_moesorting.hpp | 20 + .../15_fused_moe/instances/fused_moe_api.cpp | 80 ++ .../instances/fused_moegemm_api.cpp | 33 + .../instances/fused_moegemm_api_internal.hpp | 60 ++ .../instances/fused_moegemm_api_traits.hpp | 53 ++ .../instances/fused_moegemm_bf16_m32.cpp | 14 + .../instances/fused_moegemm_fp16_m32.cpp | 14 + .../instances/fused_moesorting_api.cpp | 73 ++ example/ck_tile/15_fused_moe/main.cpp | 603 +++++++++++++ example/ck_tile/15_fused_moe/misc/moe-0.png | Bin 0 -> 76830 bytes example/ck_tile/15_fused_moe/misc/moe-1.png | Bin 0 -> 92535 bytes example/ck_tile/15_fused_moe/misc/moe-2.png | Bin 0 -> 126766 bytes example/ck_tile/15_fused_moe/misc/moe-3.png | Bin 0 -> 18655 bytes example/ck_tile/CMakeLists.txt | 2 + include/ck_tile/core.hpp | 2 + .../core/arch/amd_buffer_addressing.hpp | 103 +++ include/ck_tile/core/arch/arch.hpp | 18 + include/ck_tile/core/arch/utility.hpp | 24 + include/ck_tile/core/tensor/buffer_view.hpp | 86 +- include/ck_tile/core/tensor/load_tile.hpp | 54 +- .../core/tensor/static_distributed_tensor.hpp | 26 + include/ck_tile/core/tensor/tensor_view.hpp | 42 + include/ck_tile/core/tensor/tile_window.hpp | 74 +- .../core/tensor/tile_window_linear.hpp | 159 +++- .../ck_tile/core/tensor/tile_window_utils.hpp | 54 ++ 
include/ck_tile/core/tensor/update_tile.hpp | 56 +- .../ck_tile/core/utility/static_counter.hpp | 116 +++ include/ck_tile/host.hpp | 2 + include/ck_tile/host/device_memory.hpp | 35 + include/ck_tile/host/fill.hpp | 113 ++- include/ck_tile/host/host_tensor.hpp | 121 ++- include/ck_tile/host/joinable_thread.hpp | 27 + .../host/reference/reference_fused_moe.hpp | 196 +++++ .../host/reference/reference_permute.hpp | 23 +- .../unary_element_wise_operation.hpp | 99 +++ include/ck_tile/ops/flatmm.hpp | 10 + .../flatmm_32x512x128_1x4x1_16x16x32.hpp | 615 +++++++++++++ .../flatmm_sn_32x128x512_1x4x1_16x16x32.hpp | 562 ++++++++++++ .../ops/flatmm/block/flatmm_uk_config.hpp | 10 + include/ck_tile/ops/flatmm/block/uk/README.md | 1 + ...m_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc | 613 +++++++++++++ ...atmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc | 516 +++++++++++ .../block_fmha_pipeline_qr_ks_vs_async.hpp | 19 +- include/ck_tile/ops/fused_moe.hpp | 8 + .../fused_moe/kernel/fused_moegemm_kernel.hpp | 421 +++++++++ .../fused_moe/kernel/fused_moegemm_shape.hpp | 125 +++ .../kernel/fused_moegemm_tile_partitioner.hpp | 33 + .../fused_moegemm_pipeline_flatmm_ex.hpp | 651 ++++++++++++++ .../fused_moegemm_pipeline_flatmm_policy.hpp | 831 ++++++++++++++++++ .../fused_moegemm_pipeline_flatmm_uk.hpp | 354 ++++++++ .../fused_moegemm_pipeline_problem.hpp | 46 + .../pipeline/fused_moegemm_traits.hpp | 48 + include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 130 +-- .../gemm/warp/warp_gemm_attribute_mfma.hpp | 170 +++- .../warp/warp_gemm_attribute_mfma_impl.hpp | 455 +++++++--- .../ops/gemm/warp/warp_gemm_dispatcher.hpp | 58 +- .../ck_tile/ops/gemm/warp/warp_gemm_impl.hpp | 61 +- include/ck_tile/ops/moe_sorting.hpp | 11 - 66 files changed, 8066 insertions(+), 308 deletions(-) create mode 100644 example/ck_tile/15_fused_moe/CMakeLists.txt create mode 100644 example/ck_tile/15_fused_moe/README.md create mode 100644 example/ck_tile/15_fused_moe/fused_moe.hpp create mode 100644 
example/ck_tile/15_fused_moe/fused_moegemm.hpp create mode 100644 example/ck_tile/15_fused_moe/fused_moesorting.hpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp create mode 100644 example/ck_tile/15_fused_moe/main.cpp create mode 100644 example/ck_tile/15_fused_moe/misc/moe-0.png create mode 100644 example/ck_tile/15_fused_moe/misc/moe-1.png create mode 100644 example/ck_tile/15_fused_moe/misc/moe-2.png create mode 100644 example/ck_tile/15_fused_moe/misc/moe-3.png create mode 100644 include/ck_tile/core/tensor/tile_window_utils.hpp create mode 100644 include/ck_tile/core/utility/static_counter.hpp create mode 100644 include/ck_tile/host/joinable_thread.hpp create mode 100644 include/ck_tile/host/reference/reference_fused_moe.hpp create mode 100644 include/ck_tile/ops/flatmm.hpp create mode 100644 include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp create mode 100644 include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp create mode 100644 include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp create mode 100644 include/ck_tile/ops/flatmm/block/uk/README.md create mode 100644 include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc create mode 100644 include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc create mode 100644 include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp create mode 100644 
include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp create mode 100644 include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp delete mode 100644 include/ck_tile/ops/moe_sorting.hpp diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp index 93c662a288..e5ded0ef3b 100644 --- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp +++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp @@ -40,7 +40,7 @@ float matrix_core_swizzle(matrix_core_swizzle_traits t, else if(t.permute.compare("0,1,3,4,2,5") == 0) { constexpr matrix_core_permute_style pstyle = - matrix_core_permute_style::permute_b_nr_kr_kw_nw_kv; + matrix_core_permute_style::b_nr_kr_kw_nw_kv; using Kernel = matrix_core_swizzle_kernel; @@ -83,7 +83,7 @@ float matrix_core_swizzle(matrix_core_swizzle_traits t, else if(t.permute.compare("0,1,3,4,2,5") == 0) { constexpr matrix_core_permute_style pstyle = - matrix_core_permute_style::permute_b_nr_kr_kw_nw_kv; + matrix_core_permute_style::b_nr_kr_kw_nw_kv; using Kernel = matrix_core_swizzle_kernel; diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp index 60ac103ec3..28f4c452bc 100644 --- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp +++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp @@ 
-42,8 +42,8 @@ enum class matrix_core_permute_style { permute_b_n0_k0_n1_k1_n2_k2 = 0, // 0,1,4,2,5,3,6 permute_b_n0_n1_k0_k1_n2_k2 = 1, // 0,1,2,4,5,3,6 - permute_b_nr_kr_kw_nw_kv = 2, // 0,1,3,4,2,5 - permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv, + b_nr_kr_kw_nw_kv = 2, // 0,1,3,4,2,5 + b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv, }; // assume this is B matrix, originally we have batch*n*k @@ -203,7 +203,7 @@ struct matrix_core_swizzle_kernel else { // clang-format off - // permute_b_nr_kr_kw_nw_kv or permute_b_nr_kr_waveflatten + // b_nr_kr_kw_nw_kv or b_nr_kr_waveflatten constexpr index_t Kv = Alignment; constexpr index_t Nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane; constexpr index_t Kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane; @@ -332,7 +332,7 @@ struct matrix_core_swizzle_kernel make_tuple(sequence<0>{}, sequence<1>{})); return tmp_1; #else - // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv, + // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv, constexpr index_t kv = Alignment; constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane; constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane; @@ -376,13 +376,13 @@ struct matrix_core_swizzle_kernel else { #if MERGE_2D_013425 - // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv + // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv return make_tile_window(dst_view, make_tuple(number{}, number{}), {i_n * NPerBlock, i_k * KPerBlock}, get_dst_dist()); #else - // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv + // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv constexpr index_t kv = Alignment; constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane; constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane; diff --git a/example/ck_tile/06_permute/permute.cpp b/example/ck_tile/06_permute/permute.cpp index af95b64e69..477ae370b9 100644 --- a/example/ck_tile/06_permute/permute.cpp +++ b/example/ck_tile/06_permute/permute.cpp @@ -264,7 +264,7 @@ bool run(const 
ck_tile::ArgParser& arg_parser) { if(arg_parser.get_str("perm") == std::string("0,1,3,4,2,5")) { - // permute_b_nr_kr_kw_nw_kv = 2, // 0,1,3,4,2,5 + // b_nr_kr_kw_nw_kv = 2, // 0,1,3,4,2,5 matrix_core_swizzle_traits t; t.data_type = data_type; t.permute = arg_parser.get_str("perm"); diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp index 91b54932ce..0cb393f7de 100644 --- a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp +++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp @@ -5,7 +5,7 @@ #include #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" -#include "ck_tile/ops/moe_sorting.hpp" +#include "ck_tile/ops/fused_moe.hpp" struct moe_sorting_trait { diff --git a/example/ck_tile/15_fused_moe/CMakeLists.txt b/example/ck_tile/15_fused_moe/CMakeLists.txt new file mode 100644 index 0000000000..a716eef19e --- /dev/null +++ b/example/ck_tile/15_fused_moe/CMakeLists.txt @@ -0,0 +1,19 @@ +set(TILE_EXAPMLE_FUSED_MOE "tile_example_fused_moe") +# not using add_example_executable() to add this target, since we don't want this to have +# to be included in "make all/install/check" +message("adding ${TILE_EXAPMLE_FUSED_MOE}") +file(GLOB INSTANCE_SRCS instances/*.cpp) +add_executable(${TILE_EXAPMLE_FUSED_MOE} EXCLUDE_FROM_ALL main.cpp) +target_include_directories(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_sources(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS}) + +set(TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS) + +# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations +list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) +list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_BUFFER_LOAD_AGPR=1) # TODO: enable load to a +list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=4) # rta +# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS 
-mllvm -greedy-reverse-local-assignment=1) +# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) + +target_compile_options(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS}) diff --git a/example/ck_tile/15_fused_moe/README.md b/example/ck_tile/15_fused_moe/README.md new file mode 100644 index 0000000000..dd566c1667 --- /dev/null +++ b/example/ck_tile/15_fused_moe/README.md @@ -0,0 +1,69 @@ +# fused-moe +Implementing the fused-moe block operator using ck-tile. This is a scatter/gather-group-gemm based solution, similar to that of [vllm moe](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py), but we introduce more kernel fusion to boost performance +![](misc/moe-0.png) + +The benefit of this fused-moe: +* 1.5~2x perf boost compared with current vllm solution +* zero workspace to reduce memory footprint +* far fewer kernel instances, easy to maintain + +# Implementation and feature support +## moe-sorting +this is a common pre-process step before the actual moe-gemm. The purpose is to transform the moe loop over from token-by-token to expert-by-expert, make sure every workgroup is working for a single expert (B matrix). Besides, we extend this op to do the zeroing of the output buffer(to be used for reduce buffer with atomic) + +## moe-gemm +`moe-gemm` is a group-gemm based back-to-back gemm, where the row-id of input token comes from another buffer.
Naive understanding of fused-moe is from token-by-token view as below picture: +![](misc/moe-1.png) +After `moe-sorting`, we can view this algorithm as expert-by-expert, as below: +![](misc/moe-2.png) + +## optimization +summary of the key design of this fused-moe operator: +* fuse 2 group-gemm + activation + `topk-weight` multiply into single kernel, using atomic for 2nd gemm accumulation +* fuse buffer-zeroing in `moe-sorting`, the user no longer needs to call extra torch.zero() for the out buffer +* fused scatter-gather for row index(same as vllm) +* pre-shuffle B matrix(weight) to maximize memory throughput. input(activation) keep original layout `[batch, hidden]`. +* extremely optimized pipeline using block-inline-asm(we call it `micro-kernel` or `uk`), while not breaking the *composable* design of ck + +## +``` +// [indexing implementation-1] +// using M_a as constexpr block_size to partition all tokens into different slices +// each slice map to one expert, and one expert can have multiple slices +// e.g.
num_experts = 6, topk=3, M_a = 4, input_tokens = 5 +// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]] +// tok-0 tok-1 tok-2 tok-3 tok-4 +// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number) +// +// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 4]] +// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5 +// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]] +// +// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1) +// * this could be larger than actual, since actual tokens are on GPU +// +// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 0, 1, 2, 4] +// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4 -|- exp-5 -| +// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, c, f, i, o] +// +// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr +// +// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5] +// * length is (max_num_tokens_padded + block_size - 1) / block_size +// +// num_tokens_post_padded_ptr : [28] +// num_sorted_tiles_ptr : [7] +// +// * different from vLLM +// 1) token_id stored in sorted_token_ids_ptr is actual token_id, not token_id*top_K expanded id +// 2) need sorted_weight_ptr +// 3) use num_sorted_tiles_ptr, already divided by M_a +// +// * below used for indexing +// 1) sorted_token_ids_ptr [max_num_tokens_padded] +// 2) sorted_weight_ptr +// 3) sorted_expert_ids_ptr +// 4) num_tokens_post_padded_ptr/num_sorted_tiles_ptr (select one) +// +// max_num_tokens_padded: topk_ids.numel() + num_experts * (block_size - 1) +``` \ No newline at end of file diff --git a/example/ck_tile/15_fused_moe/fused_moe.hpp b/example/ck_tile/15_fused_moe/fused_moe.hpp new file mode 100644 index 0000000000..6bd7688d8a --- /dev/null +++ b/example/ck_tile/15_fused_moe/fused_moe.hpp @@ -0,0 +1,52 @@
+// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "fused_moesorting.hpp" +#include "fused_moegemm.hpp" + +struct fused_moe_args +{ + const void* a_ptr; // [m, k], input token + const void* a_scale_ptr; // [m, 1], token scale + const void* g_ptr; // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w]) + const void* d_ptr; // [e, n, k], pre-shuffle([e, nr, kr, w]) + const void* g_scale_ptr; // [e, 1, n], gate(up) scale + const void* d_scale_ptr; // [e, 1, k], down scale + const void* y_smooth_scale_ptr; // [e, 1, n], smooth-quant-scale for 2nd gemm input + void* o_ptr; // [m, k], output token (no need to do zeroing) + + const void* topk_ids_ptr; // [tokens, topk] + const void* topk_weight_ptr; // [tokens, topk] + void* sorted_token_ids_ptr; // [max_num_tokens_padded] + void* sorted_weight_ptr; // [max_num_tokens_padded] + void* sorted_expert_ids_ptr; // [(max_num_tokens_padded + block_size - 1) / block_size] + void* num_sorted_tiles_ptr; // [1] + + ck_tile::index_t block_m; // block_m, used to devide the input + ck_tile::index_t hidden_size; // k + ck_tile::index_t intermediate_size; // n / TP, for Gate. if Gate+Up, Down need divide by 2 + ck_tile::index_t num_tokens; // input number of tokens for current iteration + ck_tile::index_t num_experts; // number of groups + ck_tile::index_t topk; // need this? 
+ + ck_tile::index_t stride_token; // for input/output, stride for each row, should >= hidden_size +}; + +// This is the public API, will be generated by script +struct fused_moe_traits +{ + std::string prec_i; // input precision + std::string prec_w; // weight precision + std::string prec_o; // output precision + std::string prec_st; // token scale data type + std::string prec_sw; // weight scale data type + std::string prec_sq; // smooth quant scale + std::string prec_kw; // topk-weight data type + int block_m; + int gate_only; + int fused_quant; // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant +}; + +float fused_moe(fused_moe_traits, fused_moe_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/15_fused_moe/fused_moegemm.hpp b/example/ck_tile/15_fused_moe/fused_moegemm.hpp new file mode 100644 index 0000000000..b8e51475ad --- /dev/null +++ b/example/ck_tile/15_fused_moe/fused_moegemm.hpp @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/fused_moe.hpp" +#include + +// this is only a convenient structure for creating an example +// this is not part of the host API +template +struct FusedMoeGemmTypeConfig; + +template +struct FusedMoeGemmTypeConfig +{ + using ADataType = ck_tile::bf16_t; + using GDataType = ck_tile::bf16_t; + using DDataType = ck_tile::bf16_t; + using AccDataType = float; + using ODataType = ck_tile::bf16_t; + using AScaleDataType = ck_tile::remove_cvref_t; + using GScaleDataType = ck_tile::remove_cvref_t; + using DScaleDataType = ck_tile::remove_cvref_t; + using YSmoothScaleDataType = ck_tile::remove_cvref_t; + using TopkWeightDataType = ck_tile::remove_cvref_t; + using IndexDataType = ck_tile::index_t; +}; + +template +struct FusedMoeGemmTypeConfig +{ + using ADataType = ck_tile::fp16_t; + using GDataType = ck_tile::fp16_t; + using DDataType = ck_tile::fp16_t; + using AccDataType = float; + using ODataType = ck_tile::fp16_t; + using AScaleDataType = ck_tile::remove_cvref_t; + using GScaleDataType = ck_tile::remove_cvref_t; + using DScaleDataType = ck_tile::remove_cvref_t; + using YSmoothScaleDataType = ck_tile::remove_cvref_t; + using TopkWeightDataType = ck_tile::remove_cvref_t; + using IndexDataType = ck_tile::index_t; +}; + +template +struct FusedMoeGemmTypeConfig +{ + using ADataType = ck_tile::int8_t; + using GDataType = ck_tile::int8_t; + using DDataType = ck_tile::int8_t; + using AccDataType = int32_t; + using ODataType = ck_tile::bf16_t; + using AScaleDataType = ck_tile::remove_cvref_t; + using GScaleDataType = ck_tile::remove_cvref_t; + using DScaleDataType = ck_tile::remove_cvref_t; + using YSmoothScaleDataType = ck_tile::remove_cvref_t; + using TopkWeightDataType = ck_tile::remove_cvref_t; + using IndexDataType = ck_tile::index_t; +}; + +// runtime args +struct fused_moegemm_args : public ck_tile::FusedMoeGemmHostArgs +{ +}; + +// This is the public API, will 
be generated by script +struct fused_moegemm_traits +{ + std::string prec_i; // input precision + std::string prec_w; // weight precision + std::string prec_o; // output precision + std::string prec_st; // token scale data type + std::string prec_sw; // weight scale data type + std::string prec_sq; // smooth quant scale + std::string prec_kw; // topk-weight data type + int block_m; + int gate_only; + int fused_quant; // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant +}; + +float fused_moegemm(fused_moegemm_traits, fused_moegemm_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/15_fused_moe/fused_moesorting.hpp b/example/ck_tile/15_fused_moe/fused_moesorting.hpp new file mode 100644 index 0000000000..57dace9b41 --- /dev/null +++ b/example/ck_tile/15_fused_moe/fused_moesorting.hpp @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once +#include +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/fused_moe.hpp" + +struct fused_moesorting_trait +{ + std::string index_type; + std::string weight_type; // currently always float +}; + +struct fused_moesorting_args : public ck_tile::MoeSortingHostArgs +{ +}; + +float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s); diff --git a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp new file mode 100644 index 0000000000..bfc0ce4096 --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "fused_moe.hpp" + +float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_config& s) +{ + auto s_sub = ck_tile::stream_config{s.stream_id_, false, s.log_level_, 0, 1}; + + auto o_data_bytes = [&]() { + if(t.prec_o == "fp32") + return 4; + else if(t.prec_o == "fp16" || t.prec_o == "bf16") + return 2; + else if(t.prec_o == "int8" || t.prec_o == "fp8") + return 1; + return 1; + }(); + + auto t0 = fused_moesorting_trait{"int32", "fp32"}; + auto a0 = fused_moesorting_args{ + a.topk_ids_ptr, // const void* p_topk_ids; + a.topk_weight_ptr, // const void* p_weights; + a.sorted_token_ids_ptr, // void* p_sorted_token_ids; + a.sorted_weight_ptr, // void* p_sorted_weights; + a.sorted_expert_ids_ptr, // void* p_sorted_expert_ids; + a.num_sorted_tiles_ptr, // void* p_total_tokens_post_pad; + a.o_ptr, // void* p_moe_buf; + a.num_tokens, // index_t tokens; + a.block_m, // index_t unit_size; + a.num_experts, // index_t num_experts; + a.topk, // index_t topk; + a.num_tokens * a.stride_token * o_data_bytes // index_t moe_buf_bytes; + }; + + auto t1 = fused_moegemm_traits{t.prec_i, + t.prec_w, + t.prec_o, + t.prec_st, + t.prec_sw, + t.prec_sq, + t.prec_kw, + t.block_m, + t.gate_only, + t.fused_quant}; + auto a1 = fused_moegemm_args{ + a.a_ptr, // const void* a_ptr; + a.a_scale_ptr, // const void* a_scale_ptr; + a.g_ptr, // const void* g_ptr; + a.d_ptr, // const void* d_ptr; + a.g_scale_ptr, // const void* g_scale_ptr; + a.d_scale_ptr, // const void* d_scale_ptr; + a.y_smooth_scale_ptr, // const void* y_smooth_scale_ptr; + a.o_ptr, // void* o_ptr; + a.sorted_token_ids_ptr, // const void* sorted_token_ids_ptr; + a.sorted_weight_ptr, // const void* sorted_weight_ptr; + a.sorted_expert_ids_ptr, // const void* sorted_expert_ids_ptr; + a.num_sorted_tiles_ptr, // const void* num_sorted_tiles_ptr; + a.hidden_size, // index_t hidden_size; + a.intermediate_size, // index_t intermediate_size; + a.num_tokens, // index_t num_tokens; + a.num_experts, // index_t 
num_experts; + a.topk, // index_t topk; + a.stride_token // index_t stride_token; + }; + + float r0 = -1; + float r1 = -1; + + float r = ck_tile::launch_kernel( + s, + [=, &r0](const ck_tile::stream_config&) { r0 = fused_moesorting(t0, a0, s_sub); }, + [=, &r1](const ck_tile::stream_config&) { r1 = fused_moegemm(t1, a1, s_sub); }); + + // keep unsupported case return negative + if(r0 < 0 || r1 < 0) + return -1; + + return r; +} diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp new file mode 100644 index 0000000000..c1a4c495c3 --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "fused_moegemm.hpp" +#include "fused_moegemm_api_traits.hpp" + +// Note: this internal API only declare, not define here, otherwise will block `make -j` +template +float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a); + +template +using S = ck_tile::sequence; + +float fused_moegemm(fused_moegemm_traits t, fused_moegemm_args a, const ck_tile::stream_config& s) +{ + // clang-format off + float r = -1; + if(t.prec_i == "bf16" && t.prec_w == "bf16" && t.prec_o == "bf16" && t.prec_st == "fp32" && + t.prec_sw == "fp32" && t.prec_sq == "fp32" && t.prec_kw == "fp32" && t.block_m == 32 && t.gate_only == 1) + { + using t_ = fmoe_, S<1, 4, 1>, S<16, 16, 32>, 1, 0>; + r = fused_moegemm_(s, a); + } + else if(t.prec_i == "fp16" && t.prec_w == "fp16" && t.prec_o == "fp16" && t.prec_st == "fp32" && + t.prec_sw == "fp32" && t.prec_sq == "fp32" && t.prec_kw == "fp32" && t.block_m == 32 && t.gate_only == 1) + { + using t_ = fmoe_, S<1, 4, 1>, S<16, 16, 32>, 1, 0>; + r = fused_moegemm_(s, a); + } + // clang-format on + return r; +} diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp 
b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp new file mode 100644 index 0000000000..5872179ef7 --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "fused_moegemm_api_traits.hpp" +#include "ck_tile/ops/fused_moe.hpp" +#include + +template +using S = ck_tile::sequence; + +// do not the define of this tepmlate function inside the _api.cpp, otherwise will block make -j +template +float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a) +{ + using f_traits = ck_tile::FusedMoeGemmTraits; + using f_shape = ck_tile::FusedMoeGemmShape; + using f_problem = + ck_tile::FusedMoeGemmPipelineProblem; + + // using f_pipeline = ck_tile::FusedMoeGemmPipeline_FlatmmEx; + using f_pipeline = ck_tile::FusedMoeGemmPipeline_FlatmmUk; + using f_partitioner = ck_tile::FusedMoeGemmTilePartitioner_Linear; + using f_kernel = ck_tile::FusedMoeGemmKernel; + + const dim3 grids = f_kernel::GridSize(a); + constexpr dim3 blocks = f_kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + + static int printed = 0; + + auto kargs = f_kernel::MakeKargs(a); + if(s.log_level_ > 0 && printed == 0) + { + std::cout << ", " << f_kernel::GetName() << std::flush; + printed = 1; + } + + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(f_kernel{}, grids, blocks, 0, kargs)); +} diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp new file mode 100644 index 0000000000..cc476685de --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +// this is used to pattern-match internl kernel implementation, not to instantiate kernel +template + typename WarpPerBlock_, + typename WarpTile_, // seq<*,*,*>, used to select mfma + ck_tile::index_t GateOnly_ = 0, + ck_tile::index_t FusedQuant_ = 0> +struct fmoe_ // traits, ugly name, only used for internal +{ + using TypeConfig = FusedMoeGemmTypeConfig; + + using ADataType = ck_tile::remove_cvref_t; + using GDataType = ck_tile::remove_cvref_t; + using DDataType = ck_tile::remove_cvref_t; + using AccDataType = ck_tile::remove_cvref_t; + using ODataType = ck_tile::remove_cvref_t; + using AScaleDataType = ck_tile::remove_cvref_t; + using GScaleDataType = ck_tile::remove_cvref_t; + using DScaleDataType = ck_tile::remove_cvref_t; + using YSmoothScaleDataType = ck_tile::remove_cvref_t; + using TopkWeightDataType = ck_tile::remove_cvref_t; + using IndexDataType = ck_tile::remove_cvref_t; + + static constexpr ck_tile::index_t BT_ = BlockTIle_::at(ck_tile::number<0>{}); // block token + static constexpr ck_tile::index_t BI_ = + BlockTIle_::at(ck_tile::number<1>{}); // block intermediate + static constexpr ck_tile::index_t BH_ = BlockTIle_::at(ck_tile::number<2>{}); // block hidden + static constexpr ck_tile::index_t BD_ = BlockTIle_::at(ck_tile::number<3>{}); // block down + + using BlockTile_0 = ck_tile::sequence; + using WarpPerBlock_0 = ck_tile::remove_cvref_t; + using WarpTile_0 = ck_tile::remove_cvref_t; + + using BlockTile_1 = ck_tile::sequence; + using WarpPerBlock_1 = ck_tile::remove_cvref_t; + using WarpTile_1 = ck_tile::remove_cvref_t; + + static constexpr ck_tile::index_t GateOnly = GateOnly_; + static constexpr ck_tile::index_t FusedQuant = FusedQuant_; +}; diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp new file mode 100644 index 0000000000..93f9c77869 --- /dev/null +++ 
b/example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "fused_moegemm.hpp" +#include "fused_moegemm_api_traits.hpp" +#include "fused_moegemm_api_internal.hpp" + +// clang-format off +template float fused_moegemm_< + fmoe_, S<1, 4, 1>, S<16, 16, 32>, 1, 0> +>(const ck_tile::stream_config& s, fused_moegemm_args a); + +// clang-format on diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp new file mode 100644 index 0000000000..b8a823e8ed --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "fused_moegemm.hpp" +#include "fused_moegemm_api_traits.hpp" +#include "fused_moegemm_api_internal.hpp" + +// clang-format off +template float fused_moegemm_< + fmoe_, S<1, 4, 1>, S<16, 16, 32>, 1, 0> +>(const ck_tile::stream_config& s, fused_moegemm_args a); + +// clang-format on diff --git a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp new file mode 100644 index 0000000000..75aaf86b74 --- /dev/null +++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "fused_moesorting.hpp" + +#define MOE_SORTING_DISPATCH(unroll_num_) \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + using ms_problem = ck_tile::MoeSortingProblem; \ + using kernel = ck_tile::MoeSortingKernel; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + const auto lds_bytes = kernel::GetSmemSize(a); \ + float ave_time = ck_tile::launch_kernel( \ + s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs)); \ + return ave_time; + +float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s) +{ + if(t.weight_type == "fp32" && t.index_type == "int32") + { + if(a.num_experts > 127) + { + printf("lds size exceed, only support experts <127 \n"); + return -1; + } + if(a.moe_buf_bytes % 16) + { + printf("buf set size %d unaligned, must be multiple of 16\n", a.moe_buf_bytes); + return -1; + } + using index_t = ck_tile::index_t; + using ms_weight_type = float; + index_t smem_io_unroll_num = ck_tile::integer_divide_ceil(a.tokens * a.topk, 64); + switch(smem_io_unroll_num) + { + case(1): { + MOE_SORTING_DISPATCH(1); + } + case(2): { + MOE_SORTING_DISPATCH(2); + } + case(3): { + MOE_SORTING_DISPATCH(3); + } + case(5): { + MOE_SORTING_DISPATCH(5); + } + case(6): { + MOE_SORTING_DISPATCH(6); + } + case(7): { + MOE_SORTING_DISPATCH(7); + } + case(8): { + MOE_SORTING_DISPATCH(8); + } + case(9): { + MOE_SORTING_DISPATCH(9); + } + case(10): { + MOE_SORTING_DISPATCH(10); + } + case(11): { + MOE_SORTING_DISPATCH(11); + } + default: { + MOE_SORTING_DISPATCH(4); + } + } + } + return -1; +} diff --git a/example/ck_tile/15_fused_moe/main.cpp b/example/ck_tile/15_fused_moe/main.cpp new file mode 100644 index 0000000000..2f44f903e9 --- /dev/null +++ b/example/ck_tile/15_fused_moe/main.cpp @@ -0,0 +1,603 @@ +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "fused_moe.hpp" + +// different 
threshold for different dtype +template +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +// mfma_type, 0:32x32, 1:16x16 +// TODO: padding? +template +auto shuffle_moe_weight(const ck_tile::HostTensor& t, std::string mfma_dtype, int mfma_type = 0) +{ + assert(t.get_lengths().size() == 3); + int b_ = t.get_lengths()[0]; + int n_ = t.get_lengths()[1]; + int k_ = t.get_lengths()[2]; + if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 0) + { + ck_tile::HostTensor t_view({b_, n_ / 32, 32, k_ / 16, 2, 8}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5}); + } + else if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 1) + { + ck_tile::HostTensor t_view({b_, n_ / 16, 16, k_ / 32, 4, 8}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5}); + } + else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 0) + { + ck_tile::HostTensor t_view({b_, n_ / 32, 32, k_ / 32, 2, 16}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5}); + } + else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 1) + { + ck_tile::HostTensor t_view({b_, n_ / 16, 16, k_ / 64, 4, 16}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5}); + } + return t; +} + +template +void topid_unique_gen( + std::vector& host_tensor, int tokens, int topk, int num_expert, int seed) +{ + size_t total_size = topk * tokens; + std::srand(seed); + std::set unique_set; + IndexType current_v; + for(size_t i = 0; i < total_size; i++) + { + if(i % topk == 0) + { + unique_set.clear(); + } + current_v = std::rand() % num_expert; + 
while(unique_set.find(current_v) != unique_set.end()) + { + current_v = std::rand() % num_expert; + } + unique_set.insert(current_v); + host_tensor[i] = current_v; + } +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("t", "128", "num input tokens") + .insert("e", "32", "num of experts") + .insert("k", "5", "topk") + .insert("h", "8192", "hidden_size of this model") + .insert("i", "8192", "intermediate_size between 2 gemms of FFN") + .insert("stride", "-1", "stride per row, if -1 then equal to hidden_size") + .insert("bm", "32", "blocking factor for sorted tokens") + .insert("tp", "8", "tensor parallel size") + .insert("v", "1", "cpu validation or not") + .insert("kname", "1", "print kernel name or not") + .insert("prec_i", "bf16", "input precision") + .insert("prec_w", "bf16", "weight precision") + .insert("prec_o", "bf16", "output precision") + .insert("prec_st", "auto", "token scale data type. auto will set to fp32") + .insert("prec_sw", "auto", "weight scale data type. auto will set to fp32") + .insert("prec_sq", "auto", "(dynamic) smooth quant data type. auto will set to fp32") + .insert("prec_kw", "auto", "topk-weight data type. auto will set to fp32") + .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant") + .insert( + "gate_only", "1", "w0(gate/up) style, 0:gate+up will double interm size, 1:only gate") + .insert("api", "0", "benchmark api set: 0:fused-moe(moe-gemm+moe-sorting), 1:moe-gemm") + .insert("balance", + "0", + "if set to 1, will try balance the expert in topk-ids(convenient for testing)") + .insert("init", + "2", + "init method. 0:random stepped float(fast). 
1: random uniform, 2:rand normalized" + "normalized(slow)") + .insert("seed", "11939", "seed used to do random") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +// I:input-type, W:weight-type, O:output-type, ST:toke-scale-tpye, SW:weight-scale-type, +// SQ:smooth-quant-type, KW:topk-weight-type +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t tokens = arg_parser.get_int("t"); + ck_tile::index_t experts = arg_parser.get_int("e"); + ck_tile::index_t topk = arg_parser.get_int("k"); + ck_tile::index_t hidden_size = arg_parser.get_int("h"); + ck_tile::index_t intermediate_size = arg_parser.get_int("i"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + ck_tile::index_t block_m = arg_parser.get_int("bm"); + if(stride < 0) + stride = hidden_size; + std::string prec_i = arg_parser.get_str("prec_i"); + std::string prec_w = arg_parser.get_str("prec_w"); + std::string prec_o = arg_parser.get_str("prec_o"); + std::string prec_st = arg_parser.get_str("prec_st"); + std::string prec_sw = arg_parser.get_str("prec_sw"); + std::string prec_sq = arg_parser.get_str("prec_sq"); + std::string prec_kw = arg_parser.get_str("prec_kw"); + prec_st = (prec_st == "auto") ? "fp32" : prec_st; + prec_sw = (prec_sw == "auto") ? "fp32" : prec_sw; + prec_sq = (prec_sq == "auto") ? "fp32" : prec_sq; + prec_kw = (prec_kw == "auto") ? 
"fp32" : prec_kw; + int kname = arg_parser.get_int("kname"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + int fused_quant = arg_parser.get_int("fquant"); + int gate_only = arg_parser.get_int("gate_only"); + int api = arg_parser.get_int("api"); + int balance = arg_parser.get_int("balance"); + int tp = arg_parser.get_int("tp"); + int init = arg_parser.get_int("init"); + uint32_t seed = arg_parser.get_uint32("seed"); + + // w0 (Gate+Up or Gate only, N size) + ck_tile::index_t shared_intermediate_size_0 = intermediate_size * (gate_only ? 1 : 2) / tp; + // w1 (Down, N size) + ck_tile::index_t shared_intermediate_size_1 = intermediate_size / tp; + + auto prec_str = [&]() { + auto base_str = prec_i; + if(prec_i != prec_w) + base_str += "x" + prec_w; + if(prec_i != prec_o) + base_str += "=" + prec_o; + if(fused_quant != 0) + { + base_str += std::string("(") + prec_st + "|" + prec_sw + "|" + prec_sq + ")"; + } + return base_str; + }(); + auto api_str = [&]() { + if(api == 0) + return std::string("fmoe"); + else if(api == 1) + return std::string("moeg"); + else if(api == 2) + return std::string("moes"); + return std::string(""); + }(); + + auto stride_str = [&]() { + if(stride == hidden_size) + return std::string(""); + else + return std::string(", st:") + std::to_string(stride); + }(); + + std::cout << "[" << api_str << "|" << prec_str << "]" + << " t:" << tokens << ", e:" << experts << ", k:" << topk << stride_str + << ", hidden:" << hidden_size << ", interm:" << intermediate_size << ", tp:" << tp + << ", shrd_interm:" << shared_intermediate_size_0 << "|" << shared_intermediate_size_1 + << ", go:" << gate_only << ", q:" << fused_quant << std::flush; + + using TypeConfig = FusedMoeGemmTypeConfig; + using ADataType = typename TypeConfig::ADataType; + using GDataType = typename TypeConfig::GDataType; + using DDataType = typename TypeConfig::DDataType; + using AccDataType = typename 
TypeConfig::AccDataType; + using ODataType = typename TypeConfig::ODataType; + using AScaleDataType = typename TypeConfig::AScaleDataType; + using GScaleDataType = typename TypeConfig::GScaleDataType; + using DScaleDataType = typename TypeConfig::DScaleDataType; + using YSmoothScaleDataType = typename TypeConfig::YSmoothScaleDataType; + using TopkWeightDataType = typename TypeConfig::TopkWeightDataType; + using IndexDataType = typename TypeConfig::IndexDataType; + + // host verify + ck_tile::HostTensor a_host({tokens, hidden_size}, {stride, 1}); + ck_tile::HostTensor g_host({experts, shared_intermediate_size_0, hidden_size}); + ck_tile::HostTensor d_host({experts, hidden_size, shared_intermediate_size_1}); + ck_tile::HostTensor o_host({tokens, hidden_size}, {stride, 1}); + ck_tile::HostTensor sa_host({tokens}); + ck_tile::HostTensor sg_host({shared_intermediate_size_0}); + ck_tile::HostTensor sd_host({shared_intermediate_size_1}); + ck_tile::HostTensor sy_host({shared_intermediate_size_1}); // smooth-quant + ck_tile::HostTensor topk_ids_host({tokens, topk}); // to be sort + ck_tile::HostTensor topk_weight_host({tokens, topk}); // to be sort + + int max_num_tokens_padded = topk * tokens + experts * block_m - topk; + ck_tile::HostTensor sorted_token_ids_host({max_num_tokens_padded}); + ck_tile::HostTensor sorted_weight_host({max_num_tokens_padded}); + ck_tile::HostTensor sorted_expert_ids_host( + {(max_num_tokens_padded + block_m - 1) / block_m}); + ck_tile::HostTensor num_sorted_tiles_host({1}); + + if(init == 0) + { + ck_tile::FillStepRange{-.5f, .5f, 0.01f}(a_host); + ck_tile::FillStepRange{-.5f, .5f, 0.01f}(g_host); + ck_tile::FillStepRange{.5f, -.5f, -0.01f}(d_host); + ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sa_host); + ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sg_host); + ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sd_host); + ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sy_host); + ck_tile::FillStepRange{-.5f, .5f, 0.01f}(topk_weight_host); + } + else if(init == 1) + { + 
ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(a_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(g_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(d_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sa_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sg_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sd_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sy_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}( + topk_weight_host); + } + else if(init == 2) + { + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(a_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(g_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(d_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sa_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sg_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sd_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sy_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(topk_weight_host); + } + + // permute weight + ck_tile::HostTensor g_perm_host = shuffle_moe_weight(g_host, prec_w, 1); + ck_tile::HostTensor d_perm_host = shuffle_moe_weight(d_host, prec_w, 1); + + // do moe sorting + if(balance) + { + int e_cnt = 0; + for(int i = 0; i < static_cast(topk_ids_host.mData.size()); i++) + { + topk_ids_host.mData[i] = e_cnt; + e_cnt++; + if(e_cnt >= experts) + e_cnt = 0; + } + } + else + { + topid_unique_gen(topk_ids_host.mData, tokens, topk, experts, 11913); + } + +// leave it here for future debug purpose +#if 0 + a_host.loadtxt("../../ater/input_torch.txt"); + + topk_ids_host.loadtxt("../../ater/topk_ids_torch.txt", "int"); + // topk_ids_host.savetxt("topk_ids_2.txt"); + topk_weight_host.loadtxt("../../ater/topk_weights_torch.txt", "float"); + std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; + + g_host.loadtxt("../../ater/w1_torch.txt", 
"float"); + std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; + d_host.loadtxt("../../ater/w2_torch.txt", "float"); + std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; + + ck_tile::HostTensor g_perm_host = shuffle_moe_weight(g_host, prec_w, 1); + std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; + ck_tile::HostTensor d_perm_host = shuffle_moe_weight(d_host, prec_w, 1); + std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; +#endif + +#if 0 + std::cout << "sorted_token_ids_host:" << sorted_token_ids_host << std::endl; + std::cout << "num_sorted_tiles_host:" << num_sorted_tiles_host << std::endl; + std::cout << "sorted_expert_ids_host:" << sorted_expert_ids_host << std::endl; + std::cout << "topk_weight_host:" << topk_weight_host << std::endl; + std::cout << "sorted_weight_host:" << sorted_weight_host << std::endl; +#endif + auto cal_tflops = [&](auto ms) { + double flop_gemm_0 = + 2 * static_cast(tokens) * topk * shared_intermediate_size_0 * hidden_size; + double flop_gemm_1 = + 2 * static_cast(tokens) * topk * shared_intermediate_size_1 * hidden_size; + return (flop_gemm_0 + flop_gemm_1) / (static_cast(ms) * 1e-3) / 1e12; + }; + + // TODO: this method we use expert-by-expert view, just for reference + auto cal_tbps = [&](auto ms) { + double token_bytes = + static_cast(tokens) * topk / experts * hidden_size * sizeof(ADataType); + double w0_bytes = static_cast(shared_intermediate_size_0) * experts * hidden_size * + sizeof(GDataType); + double w1_bytes = static_cast(shared_intermediate_size_1) * experts * hidden_size * + sizeof(DDataType); + double o_bytes = + static_cast(tokens) * topk / experts * hidden_size * sizeof(ODataType); + double topk_weights_bytes = static_cast(tokens) * topk * sizeof(TopkWeightDataType); + // ignore index, they are too small + + return (token_bytes + w0_bytes + w1_bytes + o_bytes + topk_weights_bytes) / + (static_cast(ms) * 1e-3) / 1e12; + }; + + if(api == 0) + { + 
ck_tile::DeviceMem a_buf(a_host); + ck_tile::DeviceMem g_perm_buf(g_perm_host); + ck_tile::DeviceMem d_perm_buf(d_perm_host); + ck_tile::DeviceMem sa_buf(sa_host); + ck_tile::DeviceMem sg_buf(sg_host); + ck_tile::DeviceMem sd_buf(sd_host); + ck_tile::DeviceMem sy_buf(sy_host); + ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes()); + + ck_tile::DeviceMem topk_ids_buf(topk_ids_host); + ck_tile::DeviceMem topk_weight_buf(topk_weight_host); + + ck_tile::DeviceMem sorted_token_ids_buf( + sorted_token_ids_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem sorted_weight_buf(sorted_weight_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem sorted_expert_ids_buf( + sorted_expert_ids_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem num_sorted_tiles_buf( + num_sorted_tiles_host.get_element_space_size_in_bytes()); + + fused_moe_traits traits{prec_i, + prec_w, + prec_o, + prec_st, + prec_sw, + prec_sq, + prec_kw, + block_m, + gate_only, + fused_quant}; + + fused_moe_args args{a_buf.GetDeviceBuffer(), + fused_quant != 0 ? sa_buf.GetDeviceBuffer() : nullptr, + g_perm_buf.GetDeviceBuffer(), + d_perm_buf.GetDeviceBuffer(), + fused_quant != 0 ? sg_buf.GetDeviceBuffer() : nullptr, + fused_quant != 0 ? sd_buf.GetDeviceBuffer() : nullptr, + fused_quant == 1 ? sy_buf.GetDeviceBuffer() : nullptr, + o_buf.GetDeviceBuffer(), + topk_ids_buf.GetDeviceBuffer(), + topk_weight_buf.GetDeviceBuffer(), + sorted_token_ids_buf.GetDeviceBuffer(), + sorted_weight_buf.GetDeviceBuffer(), + sorted_expert_ids_buf.GetDeviceBuffer(), + num_sorted_tiles_buf.GetDeviceBuffer(), + block_m, + hidden_size, + shared_intermediate_size_0, + tokens, + experts, + topk, + stride}; + float ave_time = fused_moe( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); + + if(ave_time < 0) + { + std::cout << " not supported!" 
<< std::endl << std::flush; + return false; + } + + // float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << ", " << ave_time * 1.E3 << " us, " << cal_tflops(ave_time) << " tflops, " + << cal_tbps(ave_time) << " TB/s" << std::flush; + bool pass = true; + + if(do_validation) + { + ck_tile::reference_moe_sorting( + topk_ids_host, + topk_weight_host, + sorted_token_ids_host, + sorted_weight_host, + sorted_expert_ids_host, + num_sorted_tiles_host.mData[0], + experts, + block_m); + + ck_tile::reference_fused_moe( + a_host, + g_host, + d_host, + sa_host, + sg_host, + sd_host, + sy_host, + o_host, + sorted_token_ids_host, + sorted_weight_host, + sorted_expert_ids_host, + num_sorted_tiles_host, + topk_ids_host, + block_m, + tokens, + experts, + hidden_size, + shared_intermediate_size_0, + topk, + gate_only); + + auto o_dev = o_buf.ToHost(); + // o_dev.savetxt("gpu-out.txt", "float"); + auto [rtol, atol] = get_elimit(); + pass &= ck_tile::check_err( + o_dev, o_host, std::string("OUT Error: Incorrect results!"), rtol, atol); + std::cout << ", valid:" << (pass ? 
"y" : "n") << std::flush; + } + std::cout << std::flush << std::endl; + return pass; + } + else if(api == 1) + { + ck_tile::reference_moe_sorting( + topk_ids_host, + topk_weight_host, + sorted_token_ids_host, + sorted_weight_host, + sorted_expert_ids_host, + num_sorted_tiles_host.mData[0], + experts, + block_m); + + // done, preparing GPU buffer + ck_tile::DeviceMem a_buf(a_host); + ck_tile::DeviceMem g_perm_buf(g_perm_host); + ck_tile::DeviceMem d_perm_buf(d_perm_host); + ck_tile::DeviceMem sa_buf(sa_host); + ck_tile::DeviceMem sg_buf(sg_host); + ck_tile::DeviceMem sd_buf(sd_host); + ck_tile::DeviceMem sy_buf(sy_host); + ck_tile::DeviceMem o_buf(o_host); + + // manually clear output buffer for atomic + o_buf.SetZero(); + // + + ck_tile::DeviceMem sorted_token_ids_buf(sorted_token_ids_host); + ck_tile::DeviceMem sorted_weight_buf(sorted_weight_host); + ck_tile::DeviceMem sorted_expert_ids_buf(sorted_expert_ids_host); + ck_tile::DeviceMem num_sorted_tiles_buf(num_sorted_tiles_host); + + fused_moegemm_traits traits{prec_i, + prec_w, + prec_o, + prec_st, + prec_sw, + prec_sq, + prec_kw, + block_m, + gate_only, + fused_quant}; + + fused_moegemm_args args{a_buf.GetDeviceBuffer(), + fused_quant != 0 ? sa_buf.GetDeviceBuffer() : nullptr, + g_perm_buf.GetDeviceBuffer(), + d_perm_buf.GetDeviceBuffer(), + fused_quant != 0 ? sg_buf.GetDeviceBuffer() : nullptr, + fused_quant != 0 ? sd_buf.GetDeviceBuffer() : nullptr, + fused_quant == 1 ? sy_buf.GetDeviceBuffer() : nullptr, + o_buf.GetDeviceBuffer(), + sorted_token_ids_buf.GetDeviceBuffer(), + sorted_weight_buf.GetDeviceBuffer(), + sorted_expert_ids_buf.GetDeviceBuffer(), + num_sorted_tiles_buf.GetDeviceBuffer(), + hidden_size, + shared_intermediate_size_0, + tokens, + experts, + topk, + stride}; + + float ave_time = fused_moegemm( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); + + if(ave_time < 0) + { + std::cout << " not supported!" 
<< std::endl << std::flush; + return false; + } + + // float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << ", " << ave_time * 1.E3 << " us, " << cal_tflops(ave_time) << " tflops, " + << cal_tbps(ave_time) << " TB/s" << std::flush; + bool pass = true; + + if(do_validation) + { + ck_tile::reference_fused_moe( + a_host, + g_host, + d_host, + sa_host, + sg_host, + sd_host, + sy_host, + o_host, + sorted_token_ids_host, + sorted_weight_host, + sorted_expert_ids_host, + num_sorted_tiles_host, + topk_ids_host, + block_m, + tokens, + experts, + hidden_size, + shared_intermediate_size_0, + topk, + gate_only); + + auto o_dev = o_buf.ToHost(); + // o_dev.savetxt("gpu-out.txt", "float"); + auto [rtol, atol] = get_elimit(); + pass &= ck_tile::check_err( + o_dev, o_host, std::string("OUT Error: Incorrect results!"), rtol, atol); + std::cout << ", valid:" << (pass ? "y" : "n") << std::flush; + } + std::cout << std::flush << std::endl; + + return pass; + } + return false; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + std::string prec_i = arg_parser.get_str("prec_i"); + std::string prec_w = arg_parser.get_str("prec_w"); + std::string prec_o = arg_parser.get_str("prec_o"); + std::string prec_st = arg_parser.get_str("prec_st"); + std::string prec_sw = arg_parser.get_str("prec_sw"); + std::string prec_sq = arg_parser.get_str("prec_sq"); + std::string prec_kw = arg_parser.get_str("prec_kw"); + prec_st = (prec_st == "auto") ? "fp32" : prec_st; + prec_sw = (prec_sw == "auto") ? "fp32" : prec_sw; + prec_sq = (prec_sq == "auto") ? "fp32" : prec_sq; + prec_kw = (prec_kw == "auto") ? "fp32" : prec_kw; + + // no dynamic quant case + if(prec_i == "bf16" && prec_w == "bf16" && prec_o == "bf16" && prec_kw == "fp32") + { + return run( + arg_parser) + ? 0 + : -2; + } + else if(prec_i == "fp16" && prec_w == "fp16" && prec_o == "fp16" && prec_kw == "fp32") + { + return run( + arg_parser) + ? 
0 + : -2; + } + + return -3; +} diff --git a/example/ck_tile/15_fused_moe/misc/moe-0.png b/example/ck_tile/15_fused_moe/misc/moe-0.png new file mode 100644 index 0000000000000000000000000000000000000000..aed1964f2802c4e7f65d7080f338309c8c2287a6 GIT binary patch literal 76830 zcmdSA^;=t8)HPZ>c+lbyytuZwJ4K7TLveS4yB065#ibN0?ouoicXxM(FXx>1yZ8PH z@BM-2A$fMP*Pd&vF~=NpCQ?~Z8XbiQ<=wk?=&~{rs_)*xmO=mA0kF__u9#x1-@SW# zla&xv^ISa6M#{pKTpVci+1!1{gbGLu-liw$0?@7Nd&Bj__e_2b~xSX|8}W3GCaJ{=5;lm&A-v1l8B6q{QdiPyT$tM&Yc&t zkyPfZ-Lc?7)*4ku(xxRrv!R6aeG_BjjX6XA*!pbm!!l;w#+bkQzCvqHXPs{cCAo0~ z?3)9`4wWyqvmDTs|FWD|?eOE5YjD{c-*uXw2tmf#h2F{iSZmsIsK!XMc7AzT4|i_8 zJ6N~g3ajFp3VrH_LT~4L15e{$7GowUF^9D_bJc79Z=@%uiSjJ6pFhXAIa{Y6A0LOJ zzVERC1q%lE@kwqIL`2=D(XnY5u_*UxS-uf?p0(dqcfLMK#S(Rs-|SCrS%HJ<+V^sN z47?9$e1-0oT`qc~Cdhf6x24JbLNI}J7a;hWxu?_Cv)8-Mx0PDs-qV%Vv8@Qn_2miv zv$jn8mJd`^RL|iK?Kk)xi~wku>$))RRwxma|>Pd9}5* z=~mj#4O2V#Cr2oaO#l|^t@2|qw%M&@(;69E-J zLfH5tJ6a+z==x}m6$tQs*hxz>@cvqvs6yItzZpHk+DStG(QYwhUto5)OcppBL&&+C z>Cl1Y-_zc9F+luN5tkRSp!8**vdMl$2SqOLM)w(c!?4b5_@elQ-2eF^MbluRTA$yu zpz!^W|Laxm+v8kk-`;Q%-J17N)lN3A%kJo}aXxw9xcLP4<9Vks?z!jNC5KI=h++b; z$R~`Peu4YLiW;#EG4NCF+w)$_QMLHbE%<3vp^&TqDsyEzw6 z+>5Si{9kO`SVHY7#Y)vHGz4F+r$ELM1_rAu9&51#7;pbPRzvQXj>SZdpr)P^!ejP? 
zz}4v5+pEVFPuoW(!a+l?os=c})-i!Nt&O@77AQtkwgzI2O97POXoQp*=L50CSpA#f zICP2vC-qa-bT6e6^$}3$Q;3MBV;^l+qOAGdt*pJ?ehN8`us8i%S1?$rQn(6!vTIt3 z-3P{cbOpfRp{XbG$>27SpTdpPY!bY)_vSK%bBj8(>KwBOZf>-mLFtU;u~1f4X0w7& zxVrY`ZY*|!Ummfjq#_29=;xv6_~rL>yW6zi|GM_}vUW8g%qsDcW(Y?9=5KX4qiIM= zPL645CPm;DPBab1?U$v?uJ?a?z1SLrg8F5HE-ylv^yj#fopR1?nkEMHxP)PC_D+|5 znGk;SJ<^1pzt;VNX!o;K^r!CdMOop7&T&}z=IumzAnmH#oW61%s&wPA&eS`86 z{pc4ZLGDepFYuq)7}Ve^Q8mJIw>CEdb?og=k;r{(eIJ|w44W~}&(DV72lMeP-sTPh zI_d9fIx+Xd(1(bkKM5nYmywf0=@h~a_SHuEdX591SK9H7N_taix(B+f7Kb$GVOCa5$tKqWRXxM!L+Ei0#X4R7jk7`ZN+Rg(DkcUSLyj09^zwW)@ofVy@9`B}GM9(6CxzMWiV4pVVL0 zlgFX%75-I$1>CoIoW*@qy&wgtu}NO}*2^XPd(mxm2*2J_F#!hxu zf>f21l)||-ADep&UoX?3UxF9NRH@&33&|H}184dF`B0MO*n^ysidQZbLrXb3_yFbO zHQy`rXFU7qLg{#U)^@HwT^-SPNJ#-V1@9*7l;g3GoRvwx{P-N06hhqny;l8x0?Oh@ zjZsqY@&R<^& zcWI~`HzS;eiir$|XCcmAtooeg>*v8c!WN-a6W>H*mKt)A&?|a{q7mbOF_{r=mw%c{ zXBn>Q+2YpFGd{~D;~l?Q`O0ud{=6-`R#7q;;%nnaO@p6^u$DrtA5RVx?Rq&`Z6}5` zB9p-C70U5B{hbhGv_wLL+pSEPPMhNlLm4v%X(caKJkTR6 zZWN0eNBl_pt}M9|l&XJZM3`o1 zAs1$W^9GQm^VzxRquvw9sA&z%|JSyiN`OO*4_7ECK0!C{;cbijP2}n)A56WC1OaQ? 
zx!C>n{QDrSk|{aSsrI{7A1s1UOBlqk(alE-ZwL#-G!Q8~3}m>mi1|U%8uUCH983IZ zpoYwQKR-$y%_Lr#;4vyuQyot>NFigb;6wf5r{B~+X@BDoHb#==hPiUR(o`@jr%rj7Y^Zz=h;l=K6gs z;?0XK)VWAXNb27v?xURL)nX&VrFeAMAdM%Yh9QF5yn!ZbU{UAcM%aJ@IaE9|@%n$lqQ z&Q@CA8!BY;&FFNnU5v1_qBX>oi)}|3Zg3uwq2qPnot9ZI#w#KoBci~VN}8;PK>+>8 zP?;;#2KR`=1G9lzjb;Ow1p7w60yp)RTmEu;)@qv7LWRzLHAt@^mD&uMxje|R0ah7< zo<$9Q87BlD3z6t!)J|DR4)%sHQAxua3S+o(%i`a;pcO4Uj(>X zB2f7|RTP9!tscA{YM`V-jsW3hB#v^Pe6AiFp`lWd&Jf_ zk@g^;jSdSYrz8v24D>9-{>ETTY;s*4Bqm<1f=bVqj2QBTNmT6f`K+!3Xk5f4Kv-gM z_!-$5#R1tov^x)C#Cy)bs8QMVV3kNuxV*@B88r}18w04Rtp(CigqCVw8|XCI#{!XP zLuLcZyRqtFub))POZlIHq{Q@}>cZ5zM1WIheC59jZ-iQAYYls>m`zj978bPOt zq0TR+Fr+702AD^LbhCB~h>c7CSf_X7E`4_y?JSZ4vZ41)#)BoL#{3JcgO!BezkjzD zeNC}Yio1;}6%-6&CQ6f(TW?los?}9PISuOTb?dcVAHL|_ANGeTNq1|KwfCJs94Ec(fy-%R5TnjYSV!y;Yja$ zS0YX~Rv5Asg5^Qd0D0rI0`9295UB!qWjV+F*|=8RYt(bVIqim@2aXy{RtD~Mkp{W3 zP<*Y>L$FQosa&@g+H4ME^TTfDWEaHyLL4`mM~iBaHLmxmFwRkwhZUecq(o`mSxfIL zk`(-$KBz+4jL;*h1&cNyU~k4yhA~ympf9Ivegc*KD3o|MfqC%jzbRB$+pn=7 zMRwidRG$!b^@MpOu0twigIq-qF3zC3Rctq!qGZ3i+uj&7I~)vfbTU*sf)yaS*M61q%OhM1QmF= zM4!hG*(pLOSpwZa=|H4OIPup%GCb5T>V6*nClwQlu;|?HFC~gQctqPsYnyB5u06#RxZ(<+5_+#+ z6#@04+#C2nf_$T~D=er&yjQI^@llF zFp6j)qW`kTfO(LH1!jnkV0}s-G~PSM1M19oH9Kvsw^n95JTt!kDdq+G>U6!sSW0-*gw-F&-fGnrEM<8}+68pMzCgcFBpfQBCiz;fbRw2Qdy#Q^5^ zRBfs5n`wGW4}qLdoPX!NIEGrtMsb7B5AGR?{yLF(gyV(PJ2J{J>tLXupszlv;Wb3& z;P8VWQ1hPY2J`yM4YI#>7 zireQKy^=K}D1R~4MaWdrsib0nND3$M_c&U#*Kqg%2VGHab?GOkorHcOHXYZ@JjdDm zd(#;(r%5nRs6^mCMF9yUPb^HNOI=exlrq$LG>do2o8%yTFl0}%qAahSqFR#A_-E<2 z1j){Q>u6p^C`oP>$h^Gc4$6+QP2F!ch zR}v?Nz(Y>OTEdh`;eq$HR+E(1VO0+(bJWAw9aJLhnT-%|GeYwhEtMf9$Zu&-?+zGR zqRsAkK-ko17Lyy=(4Zr0D`$15S#6 z{OEBWl|>vPMME?v6T%NYUgs);3H(Ngkg@1s_l4FmC8AS&z~mcQ5ejYWEi6rx){sSg z$UWL{*q#J{SO}l7=l%7M1cW0B4Rq|xcE7wG6Bcp2dUA8RawH*uxt*UkI&z>?Pf}F+ zyKbg!xnr6D`o2dI1pgkJ-f3V^x2psNtrSL49$Cnpn$ehvYqhoM>yO*QZjlic2`lP| zkB+QGU$!X=ZhOtX32avc5x#0byh4XtR_r~u9Zzc3!$D&_@b`XLSX7Uq=`p+3-X! 
ztHwFCFraI5D*Y% z4X8#D-J|~!2zqmMa43aUclG9d0Ch=XeV^=vfJ6G}&(F1^cD{C|+t~hEsJvpR?8MHw zeYsnc&W{W>85L${SGmR)15?(M(}sKwoiF<8S=yI?TkwRTF6A2fZhSco_HUdHJqt@1 zdx)#tMhCoCB_isEH3hi|3y~Q6G&dKSJ_WoPXs>r@a!Yc!3m{qf zE*Pu0-TTJ+Ctz8C5Dcdo@XhsP5pPcT2k$RwB`7n6@;Gfu@VHhu$#;Lq+r5{O!l7_2@|uFn5Dxqpp81V$tE$%Olx_Hk4D$ftb=#C>mQ5CP{1)v;^X&WI(vp zbb=suhn7$Iti4sI7AY?OGmALKWS)gR{ zdJWUzerxmz@I-K#_~#q{xD|$as>;g=SL9yX-%4`4QOtu@e*?lO|AEtbehrXJJ%V`~ zvKN5#{)*B@`U!>smX!8_a3KCtqGNq9UkPuieBPLiiV7|gbBzOVO15TUZm!)xZf@`u z9?g_Z@WUaIXlDAbdUqib3>V%Q`CkI3ZOpEdt_OX_V{`Q*OO=MPxOJQk(I*P0oEnzNdWTlwx);UWM+tWbiN z<7hg2go}cF=!fX+mA-U!8O3ih1UAw5%;z{fOundtDAC6&t#<{I(LG5-9UbH;=*A+O z;^X!9vDhk4#_Co}#GmYKA$c%n6TUhmDo;_S93e31dvlaJAE3W@D%*{@83Wk_M&>$lIRqe|LJ zp;qUEq}79pwQxvPP~>;+>KEacYn)1qpG++DQXJL_PsK(`7zn7Jto1J)-(K$GN%=pT z;RUBa?Ykc7EqERt7uQAV%|~$l%h#`8&krD%3H11}MAC^#Nn_vaWX_=VAYw_(w)&#G zBdaGSm(o4{FH>ttv{&uH^Zp^uQ8;|_?9WCznnRD^FHiYq01wA9$Ig&})hgYlgh&pJ z&nMw&fU~c-3Sbph#yEbfK2UXM`N!=@}Fz0$*oWg1p#ASbyFKzSY zr$1E|*7Zo6g$i_AC~NHIsf}T8af8)o!5`+Mlp#c`IMq%vOFE^l41D4?ag&{6LK~1A`oBD%OkY zuO6c|>y;Q}gCwzxBYyHeK7ryd{Zhn50}#^;{q(ayZ$;)}{9iIYSNg}7<~+#c;xt&L zEdhR>chuqW`oP4UI*Q0xbSMiGhaCoeA^bKx{!VHdd|`x99+^xs<^y1b`Wv2>)MPhg zPLJR4uG8~f*yQUwBF;L2uS8)j;1}-;0Gb6Nae$K;Rj^C&S{Ld0GX=g+bL1u!1`8-kX>SCX zjAU3@`QN*DVg>Sab5jME8sav)-Gt!5pE`Db%tSZ#z6Zw*(MAcH#pZfmBc`Fte@v^* z=*rD|;32`dx**qsQ4PwFjMaSP5UIDCM#zdR%K%YGm-3jQR2h(nVv@xPLD{&U1;0m2 z)Cpv`Df{c7J47i!S_j&JxiyH+Lp>KT`6TYXwjS65f9ZbxCG zy5v5YUEw|MRg0LMl1}3KJw9%#=Ldk}RWt@;97ip{AhbtxH*Xprn#CmMsYZM#(zY?3 zB}w2k<;*^>_3|5x2kpG{(mP3Wx$V<<=g!{!)CsK8bDL35AmOneN@;kn>UZ*f!nLOL zl#ig@jW0$|(4V*HDI{)j<;r|YZ0P&wS!;HU;;=STDc)oiSmXJU^1aAU~`-M8Jf)08`^>y5by+_ zMO(Et&}hVoM!mi@6uv|pD)aAWg|Nze3*wi`g9}Kn;ag-)vFi!z3$b0Gj#0RF89fN8 zB}x8RP#uxfEcvkYNdK0wJcG+V3x0Pc_8|{0#G4})+ny+tK5Rwo@r5nM9M8g2ZaW4r?wd0xqFC z%LhV_@aZJT`v%%{Ei#%6517NzLDgiuO=DxfyX{8C@@Yy5QYKy5KhMT2T2Ey%IVQE! 
zEkb$4Wk5|4`wZk*`Ac#xjmlL55Ge*eSZCn>jo0o3x7R);C=+!9`#!hr@<=>QZisM( zNyd^162lAa|1Cc6eh0ofNnNyNl_qDJ5ZUO$T|=aEuYGF9Vf}m{i?+&`gi~0-8?awiR0%oKa?-)imnO z?+sDLwVJj%(BPFu?!rD#;r>zp7f|Q75&B*%RU;GAswyMue5WW$5xZtTG)DUdxg?*V zzX0v%@fF(l%JzSKfTp-!N#)&HZ;9%EoPIs#f>K=N9xiuLE+uOGniq9YxrU2I7qoe+;((7QF2K1-O!^FtTZcsPTK8 z`U(PCh$4_j2^rZMl9Yr9-%HSR$M!Tr?k6Jy6(WXdK{bf)xv3Fu&@d{re_7l!Tcc`* zIZ~d-r`%SQqs-uvYU2)RTV^8|BFc;1bhlS)|3YL|uoXq|3B?Z+zzplkYfgB#Zx8Kt zSNYhX^cC_D*n2tCq|(ZDzaT41wENsi?O`*(&R{u(&7iJZ??dut6V!0|`v{r_6eim? zRP(!mwp?UKE$Ykf`=kGeIf;6{-}){x6#5{4sh)I+BpaG41Yq(1>ZNR;*b79R=SN3oW8QWs}qbh`Yz(8Cz zpr1{Md9%gQY?U!56qy0Lk+hM%k?k?c>y(PvtAw8$O;S|v~_+LZBx$fkV zEmzyT3eHfEg7l#S8gRdZcPOlvo1F?ht3-y3;8f$t1S%1JQXLZg2%0j=b%dB`3sce^ zVL*OZY-yg)u_1dt{C3&f^nFJ05^b7rWo5%2LS%UVC(wvU4h;^-$}OVUf15#1e*{U3 za4lXhH|-NCRsP9Oe2k<)JzigZmKOArh>}=~_Ej<;G*>D;&r50B*9R4#`{Bt=r0r(k zTf-+xKm0th)6-jq>WgnOYDu`b4o9mRyJSK<=-n(abkO9u5Xc{@yb(0(#8XxdC9sp; zSj=WpS6mO#k63+$Toz*)2v_2EI&{1(Z2}zZTOt!BbD?TbS^1q|04@3m>cSq3d%&Z} zj9j-nYP2+34R%~ah7_mB72}5wq9_~pMnR77bUB`shlN3!0U;FbXMv?%oobJV zT0DLPFj+jGaZcEt;jPHpgmEc1CqM}a#?ATXJJTk6>rrQMxrLPsT4|xtd4zeTDq%o3 zGo;*p_ZJ=vl_ow39@cHKN)>!eZcwQe;-*;2z*Oz{I6iAS=Nkny@0Uv#FLc8f$HHeALm&(;up^t~z8$_?g zW8xAS>$y*b;4y?mFF1sa0_30?_|EuI;NC+|baWo-i2aTf493y{;Y;)7D}kaVzY(Yv z1KhuPD+Q;q;jSf~)u*vA4? 
z$mUd*kb-)Tk575Kf;}gVzy1kGlLuDmiBMM204ceDjtw~wqi}j3qSu`K8`u6Bm7;HxO}a5L8t{8Mw~JPZ7@-&sqt1ze3HoAaWzI?vEElk=E2hh zU;bQOS*FlKu^+*FqlytlIrT(*PxQ(MCqYnvA3~FP8KNuU>m{Lpg$G4*-g?3*utb|I z0lsh73hlhS7EcSWN42l#TrVP8r7nU+{B~Y!5Ecn5AiBvp-!yO*`?c{C40f;;YD2S0 zKT0!oL^n;Ep7V^lX38gjpdE4;f=%%62u+#pqkKa^MSrXj(gr6IRZ!hO9pjB|<0-3& z6l-B?Qbj^`?o+K*d~hxGMLDAtTnruyoB@G8xfwZ@iC0CsQQkQMo9O+I2=2I}JS|9K z?;eA0H)%e(l?qUPnhDy?A#=m!KwkUIpeOA5o+jQTJXjnpE=hd#k7?=*v}_G)gd&|z z>o}cp6~MyxZHVI#3j=pS!WZsafJ4w>JQ25C+tjExce74CN%HSf`y%ZaOi{Uwbmf~} z`9TFoS|*@Ws1Zr7%H)1r7k@av&`)z#j?i~pzHZ#rF>=ymppz6uiSXKP!tR&uu#GX} z)FBKrWSs~xDO+QG7$)dQ1UzcslH_%;i*gCu7IeZxES1upCgd_iFTXgiFxV&mSDc&# zx!>V*S%gaEIKptiFL5NS;`eoo-@oga*$pSsG=H$3YI#R$2*MEW68wH{aM9b#@`={bZf_YUjbt7MxEDJ^&4oD^~)eBV*I3e$Oh#M6i_Aow=AJQv!x zKc#VZ4R4hWls+~n4U&?8An!=E(XUzsEFq7uo*C>4_&@8#_-w#kh8m$tG3Ja(K7&|LI|GxxvDVWz zqh`hXhAQ!kjI_MHK5e`tlsO7V`iN3ZvQ0k3@!a9i<$cJ1D1@QSj~mZ;eCboC^~URsGTLw=y95dQ&a#xax2&L2!nhV8Kfa z;(*AW;C%XV{xg9-0|0Prc)HZ%8ZKWHa(|A@c#h`?+syNaZu_(u6_*r@IS|Pxe*AQM z;?s|(DWP}Hgs}Im>o3H8T`ix7%?PBlf%DELwbV)&R5ZZ8_aJUjz;%;G@?NcNAU>fm za8YUKLbU(rN%jT$Z#iZ|Zo(FvwJ?_Qlk83YEsnDHNJhNO&M^ zUM@9r{9c!47a9Cwad_6kpg|<)To>M>7qrnqEDZMbau}ex2@DDf`e`}{ohV&_PVLQj z2_ltE{BNCt{r|c55`z5Alw+pBM6Oai9~+F3L}Z zDF0m+yhyHI`tr{>Gs*O8sn`i!2vBt|?^gPstqw`xLuwE%|92;+c}rhZLjSw1=t(@o zG*}D(a>KCpSA$~bfB9F&fiXYpbnd;Y_y0rMTSGF>IXnFDOZ1;d#rBbF6LJvwVkVnf zROIQghD2+(>{Hu!l`OSG<)z*$Mxf~K6 zU$c9Cn!e&6nY|MJw~siUaDh3dhO0@LXZ1@}gWUSkdv7_>ivDKz3qKXNa^e5o!u5)UtdjS575?rSMkL?wfKf&SS4?LW$s^A+psT(X-FMl)%#zJEmW zr@U3ajWVSe0+OTY7q9NBcE;GByo*p%`%R@E0|E;2MKXIJyW%Az(8sOVc z*g;&mp&Hx_aq0iWL{&bS=}GbuSKjwUAdxxo#damPGi%c`a{|!u?x$RG^*J^^k{NE2|l>0=RTMI)!m;X|wck-y=i181AtdXsS%Mkv(C_L{rN$1~N zy8-^Uycv;mdz8UY#_-yic!eveah%9B*g_pA7~Dcz1|thdtsH7B9770E7TX|UfyumB zaz4$s;nC*z?X$qtNF_P=j2BAAbXLb9?VZ5ZuVLSF&Pautq4r^C<&J(N>Yq(;3#b1o z>?)&Zzdd9N_ce{d@gPcpaTMctfkJ#Wj`1PyHo6kPtMRJ|_RQ5|k=^&v+ zyuMy|y@}gq&vvE4BtM`A^H^_dtz%K93p(?jQBM8Q(M?z$V^?g9e_i(#wIJ}1|Nb4H z5|Ne7{AU(M7;ej4hn>$0rve4=u7y-*R^*dzmC{a 
zwQfR2(*E>4)W&(E`0>w`Y3rIjM8LeGLY4hGj!{BieJ_`EqAeuOD4^!O#(?|&AK)Bf zs7!)+oV1-A4~?ZeJ2!D?hMVl;yd5=v2d}!}h`YM!TG(i28c@DJTvg~$h(>#4`SJ{KTsMnyT0XtjRdGnp)h8AUALSGSZ~4@qje#$CnnA5cgwEPEg7}$Hfi)70AAnRy(i9^tC6Fe! zP}^yWe-<6*Ur_i`o|t}k=o?1jdyW=(XQy9Ax=0fWXwzH7Om$;thZuV6{nX}%6dR@} zlqK25^_Fp$^boFP;4I^Ap*&s-Eo+0FNU@d^3u9I538{I^CRO^$ zZEYNPZu1-InO`%%@>=8_5>S>NJ(EAfjg7Tnm;9RLZ3dw1h`eRoc`WcVvt3sA;;QTQ zB(O@p`1~A!&L*RpXhv;kkK-TOEEAWIe44aVusc5FkO72hB%kREj_k)ssbVagKV-CT zAji4CKyXsbbt!yG{Ci(0VT%P_&C^^m67I?Ng<%5d)XeW)u+&2bR$2pA>T#t;ij38S0m zTq4YNmXu?{?Pk%Y0v^ol{NB6ZiUKmfg*-Ibv%q z4>~Fzg&50XvrnEfd2j4=icVK3m+bOU&x07>tq6wGJDF!c2@=0#z9&Yo5=fKva1(BI z-7hHW`gJdD+8UI=L}6X4$q~6x8uMRt!b`0>i<+Md^57+>fx88;4Lq8Sia5c|2q_c$ zJ#q=){*^aX-cr7dHC>J~;(iN8R8+LsCd)<)11(fr!;Q7ytDXYnY1@as-d~&7ukIU} zr<^g@>ExWLq+5leOgMAi(r~>~-zaT}z*h1OyT`6_Y|0`zs4q*YM>%2ufyZm0X8*y* zIG-qp-=LGYckGQD=?Qads$3zF2sM_To@LXguc!wN4BINz{+!z%X?ho4&6dZTX%pKm z8WSh~OxEjvHT&fj+Wyt1KZPD4I499ucnSNjWS!{xx;<5h*kWGccjoL9#gDl>_p=ki zZRe~-EUqgvCuq8=;$>VIOZ#he^PbjuUvKgCvzYl(a`x{Koo|UUnY2YuJfog7)6@)K z%l^?u;iXO|pS(xiOP;o)8MWPZuaZZf772W3xdDw8M});&dy>Cb`-O*1dP(&DWbYrk zMq%eN5gG>$HW|Zs_uexNmbA2F0*F)mKya|eu*a94fwU9aae2-|AOOQ zn=L+E9it=(bmUvStuwFZ3I_^E5vE2&fO@2)?kSt=#vz?j%rV?<-G=4Ap9GF}>Fb!Q z*Ob})S3RG}W5NO~PtLsUQwzoYeS$5zYl^z6X9gSv&2&{_ZjxekWjDLKU#lQs6%*rP zTcbYXanCZfr@5ylCT^Aj`D*)MiOj!8zug>8QYRd%S;HnSZ+N7U?+-hrdus z`{!*1?^AmPILYp#{tds*O?@sTztW?13YyF3cjor##kX3sX%O3EYmNF{O;to3v0Ent z6^GecE(gTnwC{Xi`Ded|A~|IoGk*(Onjiky%C2}87k}g@df?Yi6X@pR7T)gasrSY1 zL+r^IA7BjqSnFi114X!=+v@PxHA9cjA9{}ZKNo-2y9t;`o&1wt;lPxdG`%UXR=yVV zJqDvGF#(&UtFC;5MEmBb#o7U!NTtOQW^uNqmAW+uv~Y&e#`wz%J}((lV!f` zi({)GoN3%@YHXF#{zG8DtlP&zV$?wW%f z8sUPL;`Kv+kF%?hRAK@bD%TIL$mHe7ZbkUBU8ku3C37fjeB|}oC%z;S0SGTn7I5d; zlu&X7Sq#9YS%$}l{J4S^p!1wi;r))pzkWy5E6~(p8O@Tozka$xGGAY-wjIO?kLOWP zsBcKaKE|Le1;2jxFgrGLc}x6LvTM(FPH7aS5_hdJB`~`pw!^cD(>C&8V1{eBb@$WJ z4<9MVsmA!KW00R?PWA;)my?(@Cq*qbPwi9_IEP&ll5J+??x@j;e@-;PT6aenjFYfI zwnrq3ULAg2m0^AyTqNHZM#saCLGOpFntr6uH%34!+Ocvmu2`}d^h8Vev#0t-$aSnY 
ze8;K0W`5y@$%bTC`#!sJK2`xPv(cmRRNH&`jQRVOq!}@gpImK?;opkMqg4Ck6XN$1 z7R`z7#FgTqFb=P@=gO4rv=Z`SrR&z*viV>fxEGKcgCGUpnXj z=N*{DrnnV8&wNT=I91zBjIn%-#)$lVHvkt@EKJPz{+yrxx`~^`PvA_OU7zs- zpNU7_Uq5h+$0{M?nycy3JC8Hez8C?0&FHtg->SEiS?jilX}_l4={h^!W|ez?8lA#8 zNhM*CKFYa89CMwy3=-OkayGi-dmHE8U!#(8auY6bMJqgh2zA2pO`8SiRBv1&c^uIo zbhCR6FI2}vs~lOtb0BF)gFgW*xi3uog}NhIfLXN#+I@VfgsEWuTIwXWbHzsVdQV_i zW6G<(O>8t3*`a4SbD^st1^1_sR%?yoBiwRxg8yNi+S`h5{r#moMtvSjThDNCTS?my z@yH#gF0alVKAjZ}!YUr8+l*zzkHhdR{??o_c)IU48J%XQO;PDjx-h{Ym8_K{)$#u5=nTdFu*KJH?u41h$sfZEs!{uuPYpHmq zW5&6tBV_dzIbLh~i5f(4R^`mU8m{_IogFjsHB zkM9}FkvtwWCQ^epLemws9x3$;bD0MOs6FCc`l+Q&Kd*XjP5${ShzSce?GX(J@0!AH zU3AAaB>vX)KUbBrruDv6^-ZMr9y_~W$^7x;J1~l97+Em~q$x8#m>*3o4qQ3vw+ID& zt+|YTq9pj&Cp!diSH2AVIv>AH)gM*gf^x{@Nb-(z!&>UX?7|PrgD)omYvEw8SJ4$b z2PJzmhZX%ud6tvRR9x@(FUovjYddH@-bw4FoqBEe(q%jo5auoBF1h70=r671ZhUrq z2}!7xV?WKh@-^6!=P*qi5_V%W*?zv}&rhmQQ2LfG@(Nn3`#P}bA?tiYW`bjLLJ^yp zRyq*;4$0owW)TA`O!c-y$aH08;GvE^x3*5-k9uXnSmdC{&oujm_HFla?FC1VEdlbT zT4pOPe0FKr(31s?4wt&3jEmcTO&?9v6n7_Utxr{|KVZk+SLUmf!Q3|p z8M95r>78tptpj9vwn(9D;Qb!-=$g4UgWNv0R!sCqRk@O8xA@m-u_ z@E+|#@qjqEnP zjRf-LUt_P}w3kMfBv_lfKx5{%EtYu*zBH&E>ZAH@{YQS8xn`&6Op($QIVM@wzV(B{ zifL{rMczi?9#bZgnlz@qltvEI%ae_U2dSd>9nH#S*8oRW^_$9cgRvD~`6-c;8f^?b z{0{@z*v!nJag75X0%Xv(jJ5JpFy_-r_l{*T|M2}sCgnSaDl1C@)0~!xnJc}YP1$CC zM{Pg8u2NC~0N{Gdn`6@uYULhugl){UvH6TrFk zR(m`24F1I%{!80Tj58a=Zk@#mnoK{;BouQ-1-E81Yg6@3!U;B8*n%sW3 zc$n_goQV(1MEQsuyh_SAPA;?J{azLfh_!puz2RxIbTqE^SeP(b+1g;VW+PP^K+X0z}oy0!n9N$c}jhA;&l z{)Ep-e$oBPa}(X zzT?koQF~&B3ZL|ZC$hqU38(4Sd~-jz$$NHib@F0+|4zxxt5O4Z-4d`{rWV8(hLP1v^_M_8{Ud zavNhNf8Lb{fvTw8;ZKF_=OcD<83pKSe!#^RJ+!Gwkt&uGyhJpuO2ny)_4 zGK_{r8$~go)?T>sNkB(b(P++^`7D8*zncj~!L3nvA=SyG*xMd$X}x3ml}}~>U8x*~ zxwY2wqVw!{rbW9lNgTjCUaE33w;iJ9(~IMJ5utU1gkk%M#b8)|%}p?XoR;xG0S}N~ zX|-y?>=SR0ZVDAx$BKa47Ch6SmcynFDm1^pH$tyJ~gUFJ#WJL0Fyur_tp zdoog97FpRbH9dPlo*D&>_KdlT9a2cNORV&n@gD}dqW{Cpdj>WAb&aAZA}UG|6r?L1 z1eD&JA_4-^J19smp?4CB2uSap&=C+2=`|GTy>~(nHK7LxNlyGf&-=d5nfv9=ojY^R 
z7iPlVd+pWss=vMGCo|rA2%9}%Z_UmqOx_et_aMDR_($*;Ai}<&;P))jrj)xpJbq!0 z3NQ9aD>L-+GXWYX_(@{Qshh@I*M2WB%b=s9&+y_!9I>z*kGXbGm$42pa2PQi;Sv=d z+w&x`>kWPn->r<+&#rgWnktgOU!0-F9jKXhH)&hYqshx}Z1|~ z^#HW(tyq?QxKsvNwG7SNbhs3#6vAGj%CG>`Kc zq!`wV52w^qNTyC#8cx`!+BXwcbWjHSMQl|8KdjEfp#_i#AX;?+XF{Vn;H;@5(wZcpL9 zCB>z|CB#!e4cu!2c2u~6Zh;uC5yp)>+y6%wy=l0h;q}tSuTrHH9jOxk3D79In0XnSnd$Og0%K!U%jRxHhG&41>IYjjT z;fL~el-dGKTQaG}Z0<#Z^9m?{aU0`mIW51m%A>LyZP z=^Q(e+_?>q*X6vL$!n1r4wkR9ggE&L#vw4_;sR9D#t^4w>nf z8(p*Yj!TR){mlO{*;V*JWMgeMVcqz<-=7~+1Lc2Uko`POaN9|XUU3|$iyuyVaQy+( z)63g|uK|9i9S4Qsj>YzIpi1d6p6azlPoov|AJ!^WrstcNxLSVRty*c$mvc3)m#z%k zDPD(l6Hm)_n~L^GOoVnilt;td_i&c-(G>9UBV0R8cDkkt;fKA5BsdfgAec<wS>@XJ>*r;my0^G5Fzq~b9i=TM^;s=MYJ>W8?o4+@ zPp51FG%ShzllFizJ(lmgWOiJNiO4}FzgZUUG~k3#Q2Iv>wbet%d{3IPjx6CO$XQMD zW^Os(Jd60h#^0h0&tsjnGaj>FD070zrU2^)9?g&YMnTu(gji=(RF&$Z=e^`O)1_@z zjBe6xwe{+a{N~KL1!;TCUJN!D=#G55n9?q5p3wCf1mm_J(OH)CdUD3mB zRCbtY6vnl@uXWCD{Mj*boWx^y2gVDQj{f-KPy|^wZ z2>6!WKv-B6$1@FgTT_(lgXweM+u_biDcjMH!F8uO$%X};-OSyK3ziE9LT_$dBF>CM zZ+Dw0CZFF8uhWTgaJe)zbawz$02$S|j(3IR)!Y`-i@5raUD6<8>r>n)d~f-9-ke1T zAA_L7uXlxt^G{~m>HBn%Fk5+gW1Rx8;w2Gz$3!zC2ryiC+5P<}@<(96flJ?R_`v-~ z6!N0$HBr7dghT>gNOk9c9~XVu$A0D-j~CK~-W~<&UruU3$J$1!-Xu8vQ2$--USynd zwq=M8p>Orq^nIsP67+)7*JbJ=Ejm(bO9uE!tqy)n;F|cO#_`~u8n)x69d6&+moI9s zQGNHq3c~4kniBs_@g!_=SuOg#2c7(Vt$VdtwMPfrvrpQB3HD5{dka~oI;>Rxs*Gy6 z9pt8aDQ^%wcnDoNhN333Z)!g6X+YtUxB6M@`)ZcGKd-lTJ>ITBwjMdG)jVmtc~kl& zMXc6k#Tj^04(m3Q*PT5R2610I)mdR!!NP_qMj8nZQKHMnBqs4=jUkf@>f-|FD4T9_ z;7fyMZpJ73@rHYI6W+W172-T$fOy9U3idmo!hIXRVAkdR3cF79>YGFcJ2E$GfYnU00WPD4)ljfp zulk#TADfVePO?Oifq2>s=LQ(mWW@75z*71I3rf2C?+;8DH#0Q7tdjj5RDrsd z+}qJT-F7%qp-ys{{N!8&oOcz4@*ZLK_X3UDH_OJY=WmKl3`@djxRg?2Wt3_%Vkkd!0^5nM~>9xDYU zA+uR@$R2Kvx~>T5<{N@Qp?ZsXhXDo8;)ifD*r^>-r5}8$J)YrN7%3*)FJ>v~@7+IU zIht)w_pylT(fORZ_0DgnwH)QnQ+W%g;t!wspD*0rtnsMdFeV;aq7=B-ihE+__ zm(yAl_X|VP(Z^T`HKRBY4{JWym7zjod1p~*7CF#}6GmIoX(oBipYMha(@#bRwomiI zfUieS5&Q9ordNgbE+zCA3Ev0HTWa)2$5%4cPY?hi-O6(;3~a+h$f`B^0M{J)hu={N 
zrw89_Q01&MgfLA?N>l#VmAZDQ8U-5t7zlssQBSzH719YKrb??j;XZhoC;ho?#bB<{Qm^1mVuMaplFqY>KO!0KflC92m^Kpn577xaZBsv-zq>!nzUehU z)F!z6J~_(*2q=Zu-#-L-enYk?uY;Re5Bo|`y+vqVkm=X3%|e<9Zpt{a`;uqi8@rJ( z->q$djg<6cKX;w|o`FR1$)x#cRdIqyQD9M0T=lc<{zZa-z z=aKB!Jl5<0*>>Au6ZFiXrkHwzxMg)MlP!=z~%aon%y>=L6>7lCR>SWi+yO+tCjUM?IA z56GVHW@Rkm4BDQX+~rSIG_tnsisojq`gVi!>D{`vLs~0$cYR)y&1n;`&TqGBz>Q#k zN%%nXWQ6YFk?9i7$m$;6d%1J~b(h&+8CWSxVuI*F+yd;Z!43ZNdxnr?dvQ`KS6QPN zokAx04G8C)_eUB@k&40akxH2FHsEagsZ#&_lbn$eBA>?SA>zCT=Uvd&!TPHQ-y_jn z=>?0~S9y5L=r)H#Pvg|F>e}iD?pC<3P~T67CtWO3*o?+4#B9u}4js-9s23d<_Cl{~ zkBSt%VHa+yzCX24&CqH9DEeaJ$0nDmWFG9}f+=>`bw9SDKVQ>nB#hGKDd5p2%qAZB z9`FzWkF^Lb@E;d2<6J8PU$>ARcdXzWn8hy%O|InSx~M}N1_>J`2K-R#!?@Ba$%Q>E z^+M05Ym=#mHwAERid9-EYQ-?nt4c3Zed3+Q`0_-9FilfTg=?v`*t*kowrHj5r{iVP zlv0;K%(TTq4Ay7a)2X+k-M7BjcnWzG@MnI5m+@9_F0uqN@6X-0;7aUJs61zyw|2*Ad9 zytC?3SI|bWVYM!i96E<7QkUhI$+IV7^}irOVzs(UPI5Wgb)6d#vFh z1J)r>B_8@L!3S6=LS~3=>j5nnWcoKQEK~Nx&{9EjOB}|x+xH<9K#|3axI8~(%o#nm zGF#F16B|i{bgFUAGSK8^F5)qD(OkXNmC1GPSed8&w=C2drl78;hm0hkJ{H<4t$7{PxZtBe8%{s5>{9N@~hJ1ECkRz>n z1hmD7B=jsw?{8S|UV`eN(uEU^J-qcyk@p`Rv@Xt}YTSKV7Tgm~+EPtVmZmv8NmMv# zW+!G5I+ba$)ou~k2*Sy47aY?V*B&~(I>Ar%is6o1$_s#)cx&oghU zOc(ZWkcAq%Hz=jN@er4<>Z1o=9LgasD~XcrA3Z!g3{p^0{l&e!w=Z}hQ~j+@epImU zgSgocXq*+{p5Wq1`!NO6te(MoVU0v})tH8IMrCKB_Bla}IayU2UpJDX0f(TS{F^C}#kq(L#Itm3)i5f|O(5dwHJ@hE@hmIay?u@6fI!U>TQ`F2 zGkuH!rXBw(_){F^&5zBp>7LEn03xH$e#2zm!LS3AuF~21E~f;}l_~H^T)5NKLfOVm zEy;{2q|q3rG=Rr0xYjpb->>h03ptBGecx&l;l$CDMyXKvJ*> zh6r#o|9P8oq+%(}#T_26J<{t*1T@V@w(=`(uYsoJ6{73`jw5SqBa{34S0-c&(soJp z4&=?{wG5dAYfU)2l{z#A|zF8)8qh$P&X(usraL0Yd2Ey5BUzEXr}W&4@0$# z|B2K#KBBg`Ob0N2s{Oc>F#DPJ*J!!i#i)Lv&@JaSlKtm@`Cw=ZqA z)X_S>Fae`+w{iZOgF(iu@3U>yS?}nHSH)dA5G;VsmA$cP#(X|bOJCC`G2$yD8BKGZ z=Q5s3LBCHK*QrWW%ojsimsKXXK!5zEXh#=7Si&px;=FW=c&km*{9!Wq>IGnlErf}% zyOp1H*rm;7F=HN}!qsN+zVoa4yRAd9g6#9YTnED2{Mpt{25iN(G9bOpXU7_<@rLtg zoo`|HRY8|yb?ChmlA?158mkiit>v_<`nF`@z06BR;ldEa^3HKdJ}V4#>D!-F+>48z zGn02n&zroe0lPvVsL%+!ry+>X!g7rc0Q0oly+oIMd+tqpwf_8Vho^C(2!^^qpL_v3 
zsex#Ku-p+}51sUTWlD#}9%K;&P*|Ya+3EpE$I1k*2Jr^*CK|E=c`XY2%LAPQh?%FE z`6sTN+m9!fAI%QCJHj@xS;itA?WfTeyxCwN_?5W$>Bzy*O}5p_^<2w?Mv!WIK0kID zoe@*bJ3K8RwSOSQbXg0a0WNL(*|ouMaU^#F#dlIMLCzz3Tb@WlAp{YDKrn-yQ^Ozb zQNm*RVAQR4u-jWCwx!p3%kCSd|MynRou1JCQ3=t0K>cC`XjX(V9CHzvlgM781Kf3- zy@~nIaqDv1O71b!m*LU5dfj>Yn@)+g7_*JeRmju0t&mM|nzDxy-=9~|E-V!U%y})n zz9M*oWuA}sbt@U?YfgZeS8ZiPMWc9hTlv*MtTCMgvZ?4~n=2348{JZ9!+6F))+ibY zbkwg|aNf80uI7l^5S@RDB)a|7MU#7=QVOD+u~Ob>t0%D!C6ep)fOxj+HKdW!}@ zUX1q$0*aSGf6yoAd%c#~{GDAO5A18pajcuCniMQ+u@#&~S^gX1T1G5Svr|bEEn3`S zow=mec5=MEYGD8C9zKK37ZK;d>g%Z;A)&Lbb~N7DQ?VS>>u=L?jB#%Em;(IEukMTkhM%sf2b~Bt zf|$WX^r1M`D#d^j^F<{hSyJMS`-UD%^cvg1e6v&)ExVZ`n6b5uMbexEF<=1$S(jR` z1L(_IEF1Vl;}W;gDBL#i9xEhiSAVdV#Hq3xzP7naf3p9spXr?Pi%U*40#gZ}mPb4D6l{^ad_4+z zdp_P+84fqGzwruo#dsAW&((oXGJJfyjvij8D4cx`giX!Bbex2_`RJv)F*onW5;3Lm zVjkCn-wc?*)GGZJU!9V(#vd_;>5WqDm2geNW~k5|aEa}#>AGO~%=wWF+PuupUdz6* zQE7*OPQ%=zGm>ps%DhQtnZMZ#n3MQOZDx`FW^N?HZdg0VJFR)Kid zl9W(1*M<0Yn_b?$p&N&(c@n)EzY9Z+nKo46FV}9%)_uCuBj9gi&`U~g;zXVdVOme& zbopemSC2Kh!W=(uCRE?vj+AJ&H5FL+8k(XvL+>uE_cl9@zRgyUj8bkzSlfT`PEzso zxhYs~LgZ?crxyKdP~VEeh=)wFNKurY`u&G}UeNhnhuG^FVHoN`Dhl!nrgfqd8s>ca zIg?z?WzLuR3}J$^Fx!fIjN2{KmtCx|EdFVR%teDGaqkXeus(1Rkyy7%;h0u&byLBZ z*kUC$6`Rz{1AOI#DJ<@VH4<%sxel=zdDmye==tKE<>?v1wqHwr;U}lfs=QF$z7g-i zb|<}918T>62@A>Qcgytb*40beniM>Bj2+5$7vKo+3)EqMPG64e5?D8U2_Rp zbXDc!e!znA-DL85sp6PN>Ni_S5%sz4-mG>fcT?f()Ih+kYVnl*igh=o^?hIW$pQ?` z*G)`e9h+r|X$xG9jHS&h_z=S~)6qXAj3kub@8530t~W5CQRh0ae5e6aaF}oLkG$Z% ze2b-hE_*&|?4{0OO`y`jZ9F((z>MoL>_pW2JK8ZcxcxY#ViIF>CA>)Y+GlU21om(l z!WMI7nbB&Md63-?dkGWyY)9LhMNROfkd$s*IJ@m&)IXKXOCG;@x1hNo+Owg+#+`3t*ZCP= zRFH%r!w%^MWtF*azvUxz2FCp^4a6;_xO1~1^ z30m2oOYV-`ksqb^P=)o-iB>!zTnbLuoo6c83YlKA7^Mjk5b;U%jA3qlSNPuMPJ0Td z&SBJyKBLgv#!)7PZWK}u=Pg~+fS1hL9pP7gW6tCceGIJm<3DWJvu=><;%xnv)WQ9T z32ggQC_Dl&HbUh!LSkyv^+o%hnBZ&1xCwSFEpWj|P9DE9*8KkGL@NO$o3}0a1P2k) zi$L+sY{9o}ez!R9AFq!lV^J&LF-`WeJB_dJ9N+L!Jk{f6_^EgO%77fw2d5-BZZ;lK zfYiAJeFQvh6;xsJ8DPjPN*C1&tUVt%z5Q2A8^xl2SR3218E(LK5J~CG32&X#$J(ck 
zYd68ASy+m-7?72ZZ_-9#V>)569k!;nUdMp`dctRe<$H|dBYRghNW~uJE|D~vWur~P za-K<6msp%F#JFVzxX-rzoa1*ZFWAw~6D{6j;hR+-sp;=!xH@mwcAkNr2fx%raX}i@ zkh9rvh41>C=G9rFzS0Lj)s9UsL@irwY3G6$urBOPHo6RMD>u0uFJhOFF2AlbKWWk; zKTKleE<)@I)hp5$udwB)jGs&83~A~|mjmt#1Sy0@q(}s>)2Pp~fRrX?M&(vZ*lXH% zWO_N@Vl^pCb2;wf=&oC3L*lpLgD$xyeaj9#RdgW!)yU_drkE$!NDza*l%W;q#+CyC zaJ`t;UXFR{f7Xo79_3zU3e#4*!}c|vBBWV#P3E~c0-s#z8q%P-Z8kz){D~ zx$*pp!NIksw5{1RRA}V5Gm$cKu{v#$jg!(O|B;q*S(^Pir6JuY##-e&^Q=S(Qpi#w z21OR*=KwS>L}&N!dBV`+r)`hm$eMHC{#Sh3z*&pZ>?#VjR?)6|w7MKepC09EZa7pp zb^hZB+tFDu!69%i6JbGvuY-hxi0mIaoXoCM8DANY4Q&hCREx@(afc>)b>=B#{R;Xn z4R%L(hF4g8qpMJe@%Xcr2SNQw&fupfFw;^WWHR2f^Ii#ZHJ}}EL%2D!vx4d;RmSs5 z{$uF_7do!b71TvHH$27xdx+kPE9c^kk?yC_l(1-xi|Ar)K6^W%Q?!TfM?6foaZ^89 zO&GaoxY}kD*VvFuHxZmYXBen@nVbMOp0|%pzPyQU=fGDL<`bzz9dDs%Pocsbo0qjr z6vw$|VcBljxB7-Bz-B|QYr=gt8d{GGuWL5B{ekX3?`tgfRyfkhY}_&`rJbblYUrIn z#b1V>!!jVEE?n~qwkAFod4!eJ`%hJ)-X}hy(;vn>JvF&5J$zw5n^AUvBAUOp-L>E8>hDtsRu9@EDqU0ey@#Gly;3`!QIVa0GPG4m_5HzoARWCIL})ie~(C_nB#4${I5BJtoVD9o}#>A@0ve(D)O`CNtos{1{F}^ z+FFsFU&w#*oSDC87_^c2py1`$1nT$aR_$0P)iW_rQ(e2oxx`<-36}A$vvv85T9OzBpK%H^eKheo5W<*taJJ*4ZZU!BoRw8`b1-9p#zx3w~WtPvLB%l&yB$zMN#|F5AVwto=~%&84P-2(mNLMCmI zm6x9vB9&1AsODEE3a0wVh33$9bbu+|U!-jcQTO>cKa@{Tv^afkj>MaAhL*M3 z&`q=h-QG>OM1ogd><%yfBG7xEDhy&5oFJn3ICGg8PRJwm&u-Xh^z}-9ltuBYXUh|G zcv~}aY5uk9zPmxOEH$ z9~Jl0RC?S1;mW6VJwz^gY6qcir`$Q4(M92=asBo zJr)NXd&=VFFx`R^`UO@lk$7?Z5M)GFOF;R>{y;5m;pJ@n9l6y2tsm2s?x>QmbUnZx z#mO$4qm60t@WiP8ViADySH9#Pb#3|dONK0E8nyV-l!oT2Z6)Siuk|35&`;wIff zqMqU`I-bL=C8$d~nw#)HA>2aN(>g%_IOWHse*X;qo%269{9gkg*%Ha%eY(zW;uLV?pDynj~U7GgX3v0Y%u zGVpDg#?3jC@L>11I~D$WMGdixAWHK8eP)8DzVyv8$-SVb=HKDalg@4LOH4A0_Qf)?WA<#hee)F&jo`lsUvl-7fVjO%9o3SX@q zgxd1sJngGyYdkzRs{dwU+xi#%!=J)!Hm?@d_hLp;6Fz6RT6Vq}Yhl0(z~25ZRH30u zsH;*IL*hZ3__a&N-0RlLrHbT1v@tvG2tNKV$pINO=B=sE<_LMdxmPXO&8ju|@=cxL z;~hb9s{e1bOTJ>j+;ndTOPY7hN;zv9QJw7);;lk%{Wp=^jZe<;=?!a+N{@+n7!DD? 
z@$klu|7ZPE$}a9@*A`DXEreqnhKC3bkB;P@6M@?R+l`l1wJnQdGM>S|B#o0B10G%u z`Zf+cPTZH|+ilpT|<@tb5inU%L)` z??zlU)8dVF;32`G-1_p;hZ50$R992w-c`<}Sh&BY<$b6azEA(G^H_p6%;mced_nn~ zn(E^@t=~FEO4E`-I_*vWR_0*H>qtlp*%6mZqZ^AzW_6L80cM;A!}AEw?NvJ(miw#d zb7!@wj!@c{>@-b^9qDZHElW{~Ta4>Fe-X*p@oyl-Eh{27B{2s)-(HONAH)?KJ@lPy zxD~NIdgS?Rk*Uoc>#ru|zWhiS_ed#-fpwKZfO@(a069JHWx!PybKC<^*d#Ux7G190 zEbw4{zC^m~Sl#C^hQuV}zIoXV9nQpu;%s+L)ZclX=|v^HKz*ya{t>M`CdEd5%(gRn zC5uoI3&r_Ezk4`=FWhUunyz_&Ui+;As9Y#X%&Itt5d<F4#+pa0@qz2rcC7vevuW%Gw&l>rwLNZrA?0f##{lTB~*`x;&A zYaUcPR*W^pS?d28QWZ?S=Ji4hJb%Ujm70WF*dYD`JeD^MQL(Zt&0+c0par5lvGN~F zvEZYf`>cm3Ph0>(g8RP=4en1H9?l7q{tf=goTOKY+R=26X}j3JGv5!Qypz0vHqHe7 z8z-Erb^`Tw8KxxZ>S)i8QB=%c9LCK#p8t)3a6BUPlht*D)IaNk(7hEWU_tT4FL=K| z27_l)v}U6ZQ!q1YfA5@Q*T`TobSE$(NDC-SA%PeXCw}oxvtW4l#vPSw zT6gIF>ZMpF*)tB_6%THl%*6f)C)`|+Op}^-6f>XAtFMMYkqlUGl$3Cn)GJ>DF#U21 zVDLUIaw=Q7EH`+ApymbWLjA?y3q;ddnbRcp`r(mBg*sFuYa z;pCJ>^_O8{)hJvkPe?)0-9Nwk`^`_Hk2EjqVnEA3;`=8a=U386=lR;-nW)GtiWE{% zmya5^ee%{wc9QIQ;C-@OW<{UE=q~_Q5w+yxMcRtNyXUf|{gppT_^INupFHq>aN?9h zyqsb=+V)VVjD6jpmXky5L$SK=XwnzQ>T^9={Q!g4j4b`#zeBdBS<8aSLu6_>YYl5u z^X1mwmUrlBXxsg6eosp>Q98M9M&Ytxc$b2UB2kz3#xJdV?Y&x9tIfb%uxJRla^7P% z;2}-}+{|!_!MjrfS5WkE)?P-^c+ib-RW%y1+bQTb$g!#&J{1I9<6g2XS5h3be)ikm z+qg>@f{=*SoSL(i3705LWb1w#$&mWmb?Q^dhi_96EfHGigOovKyFK03lbw1R&jYDA zQ*CJ2{ZySY>ezHYtabwKT6lz@Va(G+A!=@G#AhF41pJs}D7$o;-P=S)^cr)}NeD%i zs=3^#X!hFAdHQ+k1de}s_gUU4*0D2SXtAQ9=I3wtY3*Y4cFKaiP$V;x+!;Bc#S))4 zt*TFPVW)}*DsD-mQ7-u@<+P;GSvkWjw|=vKXZz5|8fK~V+ua0rF9Qkh?Pr8|c(^Fv z?+wfif~>tU1^`X8)Fq2duanFPxh#JBi^?n4ACSF0X!Ne;PuMVW>`91Tq(>cP{7&yt zzFVM1yQGk>v6)qRZ8J$Dqk zmp$HMF83Knu58mudJkQ1_5C2)C4LU1)|2S9d{jY+cIo%)p2N`U*S8^{i_QD^zRgrm zPqjXbQLrqdBp#UU!~l)4(@k_)J?K$h=-Jh`V6y9gT#jR1ViksLY6RjpzD)C9Ll88w z3oPiF_Kj{iMwdR{U1rD_IamirAHmMeug`rvFw;7rXMC&Y09{NSs!uS5 zI78Z@C6iTtlqmS-dpZFE6fAVompOt{`M|>d`Ie8fEhT^@p;py@*ZgKEj%Fpze$hMIKC#~nuSaGbXnwY zt2F;9d)F8#6>EHXA`H2E*Ba<|AyqSWXjy4|nJp!Ae%4ZkV(x87ovcMYpa!i^T9kRx z896OSa=jz@tVi64tp&uP>=@F-U3>a&J#EI%y3JgA5dWPr`9=f42Ra9N_J)KuEJf~w 
zOAv1*OQ5zi5_7OO>s=IzAL)_#Ev9Aal0N82*^++Z6q@pq#6h70Y0NdUY@MkqyJh#g zqcu@B1w#qCII3CNpz%3V)nbD1(3Fk(fy;;};tQ_EYUAU$TO>(PJg^t3M3EoI+|+a6 z_*7okpFF>UKKb=amRw)0Uncn{$XmDCyYK!({yqkwTe0kV_N_l3kj6K%=Ps?n(0A2^ z_DDf}QNLnh@Td8q{^*0s#1Cz&^sNnwK!$$aVUDsE+k-JfUIsNyEp>}wJ7~qrk_wO` zW}3;TxKY1u%0$yOVof5 zC->=_Jbd11{HzDvGB~xC-8E+brchI!_noIG?BKU@qdtE4Wk1*!qhtR*Jj(CjdXOnN z__Dg`;is$-^u04zTAqr9BwdeRzg*DAa&AB^ml8XI!TS$;754FcdKIEa52Tn`Doo94 zAS6V1JoJBaLN`@p{*dq1$QuB*#>eYjOm79aM2FMMvx27-$>j6rkP&`w2+*oE>i3F(Pd`l^KMq7Qp1kDvT_^pWGn|0mumxxpGx zH!2^fPms%qHuh+TGP10B4V5eE%{og5n&WKhmF==>w7GMn45ud_yy55<4O>+DEB49Z z69M_{WNoPbg`wUwVhmrF4udhPsOLier)$7XKeEtmAW$ogoaX$WXIcL_h_ z35tm8x9aqa{-KiF`n}uTw#33TKec1t9}1a&90e+l_pkYceAq?=-whyAhxX$oa zGA6EKcw)XVzxFXXF_qrUge%z2`TkyQwXnmPu7{)V%Z1bQRPRmwXqNxV{7&IhcHJd= zC5!+Fca6wYMM~3~m5-L!{;XTU@&8y5H|>k{nWE>L&7w`A+Nbb67(GJQ@moGs+PDMLoihyc(@QO{Pcd?|aK4q9+^=??s< zd2ac=U7q5$7tl0G?kU+1#fbZ@KLG=oWd37mXGl@J03lp;#Xu#+6%v`Jnw-dl64Eah z*&Yvqy}UzUZN~IQ!Kx_g5FD-XuIA^sz5m`M!vn5;4KpUIsArkkRG|V8`~8)l@!!{; zl^{(dMfJ)$oNsD2I?&j-h*w33s)@bAW;J^e^^aD+GY(Wg{3*D`Rn`lSNvn?26lbG% z#9s>Plzl8sA|=G5iy>MC-Xb5#$|=YSaMTX^`ZQdsV6YKf^gdzuc!mT*$Hlq-l%7?P z^t9%QC;vgo!d=S25o+6_i-H;Td+>#lkJDZ5|D+gr37sY9%+!vm3Vt ze((=w$ihLM^XUK`2Rh+`(G29$Skf{gWaFw=|sfc6E0?8I-MtOAZ?y`BOoT}QAr;m5s_db8^GzTjS_Y1y1q-WqD(yd`Y zD)XAZUaDzc@mYG$VFRKpo}8#%ifKvJHo}})DZg=B0Ax2Vvbe7xnTJJKA5n#VPN4Tw zXCVFc30^a_$?tmkc!IWVxVkUreVQbHWd&t{cVd2iPEec_DW+*jJkONhTC-1V=i1RE zIypyhyu_TDFx;v#JUO4m`f8r-X;Lpgb_|>X_D@Z;D(djy0uF&rFP&*k0abJ_OAUOP zJ-0AM&5KbRE2*S!bc1zRBBpHk*O@A5mOcF}A1lh#vX229N#VsoypK{sIMDv=umF17 zBIENJ@>L)MjtoSW2LdaFy>Dz3V=P^*hV)wCmwI@ct# z_7HF6))Glgdc_vd<_rCIFVNLBE+Hd&^f2|c%SX3V`oa@-D9_JQ<7QY`Q1Y^{@5$w@ zXSg`Q?CDRU3Hl}IhphR!W4*Ped~bQdZ&#h=Gbnw1 zo)@c_$$g?16^vt5@2@QgRtDAzU^)~M-~KI)O@+-?$I^16MYp@k{F8G5MK9{_ zW<1|m?&pEbrv9{Xm`^4&DTI8royPFI9Am*l`jRcRcExCz%H^-lu1A zy`E=nIfQriATm)8a` zE6o?*?-sO%Wq-$ur%)t2ByhMZ`SOmAt>L@8O0Sv3nO^72lRIP*->oE=M#)4J$+CI# zp%IjNHo(U<(S6`>(f1|x*?8~mP2_yUnZd#rM;Ryvlgk5@(+c;wTwrTN-uNn@>XEB 
zu{|Vpzef1Pt;|{u-D8tpKdVqBW(@u^k%MU2DRpg9I?T2O20ly-H6;tJ=@TC_v7Pgw zR3%?YC}E(i2-DmsnEUnnuEV4LSmJn&#{!Azv5~{~qBUqVVa)e5NC}kQ*zn;ye|Jls zA4^Vh=8S6>#Wx=n{w$IksXN$3@#6>Fpt!n^HH5Km*Qb$3Gt`W3|26po!7m*JSMQAPT0RjF6ympehnVSRVv*rEPOhxv!1pu_xPn9ELk?sIlQD^9xaZy)6{sx0n1%(Ke9 zyK_c7Mf~vM!QkRbM)j+ntVbWB$XWLKN}4)+Q~qUjNwo)!{S6_$tWpeOMis zK55M5N59WpfVxPJuf+Fw&xzA*Jn2Ji?cX}$j8vi~XBT+w&RSYsGh*9`Z%D_A_g0nj z3r@sEwCXI?45Mi{vOYYilyc>KJGsK@TX%m$e);HEfSZjUSx;YIkgA$A>#LUYRP`iU znfe42q*M(^c&Cx5m_2@-cYqAV)}$tB80Ms&&o?{zg6tMa20x zMAp!7@Z`4*NtAi@65z^TGM*b8Z%*hLLzn(|R`~V8Uk<%m1@5vN`K|u=s8T7j!ojkB zt2J0lk$R*j?=6FF?Dq2c*PxjTlDCef*uuw!vAa)i=+GTur7{vinw+nOh!8Dje%mR4nq3 zBE#KN^cA_wm8NpuR9mJEp=b(*u;4{nBYPw=U4#qfC3IJ_7=KtKliyO+_;_~_!_ym5 zJZ{Ddc;z_EbLF!iceCe{o>w4rE4zfiCjKGG6!W+JVv;>k*N>HbjOBklmP9+6eR12^ zXeW|NKJr1~WfbIp0Xjj&zE#O0mBHh}y&u}LOs3t9`f=f^?g1Q%3|3U~ZXx8dNO^>k zsdF|Wp_)07B=jRgQ0O4IxrUWhgpv%I7K3bRFR(ZZtw@hlpj zELhGO+)Z2RID;FfiJ*~T%H#lym1^v99Gz9H{^xQ{FKs{{y~qjXLq@jIh2fYgL8I!a zLy=0;x*8NT=*LH@djz1^!nCvAb;^M&h~RONiZ~T#LlI0B+r$tuB4kRGJY4Cjv>P_o zih>w6D~=A7xvHI}opr2{rFAm>_;6XM0!1X|-3^?Br_^Nl+-?L;SdSachYaXMwj=k? 
z_}~2}3JPUieoQo6nvIwCjtk*pS-uO8E8udur;i`KXyIf2E{G37Kk(t=iVa7L)*j8P zGMY_R=~>_zV@f1Rsm3w0TjcB!nO6)>*C^y_1w_U`ivafvynyT*HZz*m${nS4!`51Y zz{6(8l1kcLVQ<_}hjbV`DT>z7&I((@ni^%aGH6@`siw?(>a9mjxRStFQ9Dqt?H7#P zU%#pf<@CCN!Jc>co@RcyrwyI-2hnoSV9z#(&Wx1>$t3r0s^v@!E+>PAg~%dRq99NK zv@{)jt+Q!MZSc4V6k%YZ)HV<&J@4+NzJZwF;tU&Vmue{@7=^2nG{KifqMaGGHK1=1@w)<;JAO_`58?#be{~!e91g zJ!^Nl-^t1sz$7LpuOpEUx8H~YDLBM$d_X`X!i&2M8jKCr(4WCo|i_D_uIpBz0QvwKWPpuYldgU7{W8W)HtBpf%L3O(I}$Tf{VADFbZZ)J1AH;2*u?XQ|~hG zYh*1_Up{%b0xhI;GS131ZK`IQn9yla@^F=_+|jVET5zybkX+MKM+MU8TejmSb*e6K zY#2ID(~f$3o&_UuO@=OLs1!OhtgdoZJ5aEqN=HjVc$d($p~h8a4;mXDkP*n57{lf| zw#kJPxOQl0KrbC{=W3Uhi@bpGV$k?-b&}Rq?=o(z;hp^Xhc5*8`;G8hp67VMV09dR z3qrlwo>gAEJ-c|xmLmqclOm+%IcPb{3u-wX6Re934!mu`pt0#m8l~Ld z%#-XAo=h7P@SxFQ>K*}TgZ0bHnI<=-A%jLo zC}LE+n@2)LXF!>xp%fHrb{jU=v-K`nlrm&e6hRYAgA19P($R`|4ME|K0*hfs11cIx zllrNVa;blv)S_Nwc&Tx}fdugZyH)e%{yhVIZbva;hAwl5Cs`U(F8dvJ@K0186(mX0@Vt%#NXuw{7;v5zks1z zgHX`=F(L98HPgg4uBs43DcQ$K661NPsvg6_E{~!0GTzMAuc;9o95NIYtOy*Umq#fD zUL;6qkQ`T)!*bZj)j2T*g|mV(RU6Pdh*A?M9Tzz1{edmlY(3b7_J*V68d+9BXa@(& zV^k8Vyg+YwmIw!hAfA_I!hj|o#H zs@-+&#x>O_f z=e+k|T1>dVivzS5%y#FNtl||1a`LMhbINNukNZ&=z;K*{VIp+eD~F_y$wS zF`ZIqbc(?;!9s?pi2x;y+uWjF#?)Eslmz1-WH^VTLYv)Okb*FYL3)`Mm+oCwgPZGZ zMwLQl=LrQm;W7p5uqlc9S{J5O$XsNE2BTzz)nu(#DoGRU`A;MKfcq~V zn@<#08Z4A2nWHK4?oU_pf@pU$&s;SwF(R1d(ceUxB(EBlNw)8Spn;1+US-|E;;J3l zrJGL{mev_nav93^(M957ICK&9j*lGEE1`c_dc|uxGs;X273C0Kt=~jqO#Fd+$qH2m-1o;`5? 
z#bVj+6SsZxE9sp)n7}w*z@?$3MHHBMhk~-Y+}Drp_sQ)up@BEh48wBOM(fG4n%@rR z{j}>CD+r#c18~oj)0j>riwFw1G_(8UeyO9=6a8IS;G)vS-7laccUqM>+t|=VoN*F% z0~?VdFFWjxA4u002x_fBDP+b|?wH=v!l(?|RE-?P>Z0V;S!f-3`EfS18}70bZYvv- zD6c&3j_WV4&gOKX7)DXn8h2DLdD#(nOa|R}N|38WrH-yIa>n&lmK{PnLc7BSmJN!P zSDtXk_m@|mWc9H)#}T$hHln+{?5I1wFI`_KDD@&Gqm314JMxl4?zrBRu~H-mF(8bt z&hZrEC=-@UI8B7x#)c#-Di6E54WesugaDNI6STQn`XVZSOs1z8a#5kD8;aa^m<)UD!xQB-3eWHcQu^AC8@#hsV=D4?puEb~>>VMQPan z)8CU;6g{);U}wH_`b*w*q)r2v$8Vk$Jqx7|MZ(j(0PRr}n!krhAD{7H^FBPo`l8Z$ z>nfx&dS(>z&hc}4f6`XPGv)Md7d`*;P95ke)gK?6U4EYUXCs6AFDk1K z=gxfPn?SYV%nx)Lbl#k9_U3T|-kCR@Bp~Sr{soJ}wd!EbiUT>twM~W9jp(kE(ImS0 z+$_iOe7~gVF})JUXLiqsjfl_(_`9%xuju0L2igL-|Ds}cxTd|dfAS><&r4`lDmDQmPk)C!r5A~{}gFl27}G%WH^!}q;AawLE=QW zu)t{p)8>xq8?Mu6RSJKX7HAXTexTKW`~ASL`?GHUcm*f$Bq2SNoeX>9m;oQ$I>X=P z0y-6?b%qrOvbUfhpsuOB-b4^m&r(4UT`b4)!uZVO%lh;f+#@zMDlAy5@^@i@)&TAY zS`E1051{AoCC@HCTwE!qXh9Uv53uFA+j_@G`ny~}C&cAu4;EJ*EUws*Q?~VFvBS-( z;y%Gi_#tcu3_jh4||3;^Ezla&P)>@7lNd*Z;xiE+rPD-O2 z;MAF`1>El!e%f{7){mFd6lHU|uN&3xv%6;byF@@I$Kr71S2gS_EdOrD@e^e=(l`^I z7AKq_p#MY$288O=R}bxd^@z+Uz1H7_1zJR>bw4ID4Q|fFU?lWYy)fW7COCO}wuv>f z-Z_52{eENk6CWNetqD{setY7k{xF_Da1q&GP`>(b-o|5vb!Mx#p;Y```WYyULn+?_gwI8Ir*548P=H>FA{J%t$TE z3;r&V-(Xm_PtVlrXHM~VLAoxrMr(FO?S^ATyYtHS7M!9fN=Zut2fJ9#;bwb829HQf z7?#p)WLmc#(P92BEbs^5exT*}+5O$eO)oln5|hJB-J)gaAHe-U%V`(x zN5V@>PmfQC=Rp5m6=eN_+`QtVB8=t0{p}WXLvK7@uzYV;PGxO=b)&`MR!BVr36|$= zE+#HmH!d@IQooe+*sz|l5&kZ0mjm1nw4C z@fJps;QsbcQBBj)(y9Z66^mBqlbr^Y^_eeE{z7SX3D-x#e~1j^!`ibE3g) zmr*26NOb`mF9Zjuf&EeG5Wv!<{76Z) z+3qwu-4r3sq>kelg%c2Aj?$zloUIiJrA7~eV`#T>8iYl^B)$PkawdrJ0 zX`Ml>kjrUlq#zf|X%uwt_^7ezNh4C@ho{BMy8H+q7lbb8e!Ja%?cD1QtY0y*cMtx| zuqYktB>+hf4i~fTMA`8ZCo(fGUey@5A7~eV`#UQv$De969WJT-Wv}!y!0KRVO6nkp z5;b1nBZIUNftvn_G1rdlJuEfe--YeibV2vqY__ZCT%V5DzjoV9#?1*H;dG?@U&LfI z*pWpkJJl*;g{|`@!htgr40tN-DP(%o(Y7ckpgIv z3J=sw?w@kykj#{*kU+H(hHw9`bXoWJ#OhxE-%N*_L6?$@lxCsuce(xp1WlXFmh`*d z+^}>7H*bvIo#-OzHI2bAD-|6D+N3tocGhX(8pK^j29;iND0nia3a9pjBIqWu$ 
z<7kqsZZN2#ld`k2WWJm+3)~O13&8zd8Jp8xT-&t%c)`Y_MSF|NMP86*4#07aM~Eo< z*J~7G(vxo*(?2OP*x!Y1zjSf;qrVo_HjaAqqi-IXGb%0K-2q35MZ#2Tl&f|fduZw2 z{M;OcLgBl8f%}1W0k|K)JP#lg1gtrd_tUPERYtSb;bu4<1p(->L=uFIB4dNKH;(N; zE+aW1G)S$4Il#^V++SR481d)_%O1IYR7R4^&G@@?tw9x_T($Gqz2EQ3%R%n<;S~<< z2igVTe&A2+FD&1Dym)JNaY=1cmBB(&Bu$dMAUGM8<@ibcQl<|~>z^2%6d9t|sQg{n zb^*BG>;4}eo;xZ%(Zzaf(s3&OF;DMAom&3uuHz5B^VpMR@#T+67+z+%6 zK@f5)8@}6l^w)h^PL?y+oG6TzLTV91|F2fiLz3ezAJS_~uY`~QwNmcGmeTDFU%21x zd2=qOsq^o;qR~A6!{wLt?eW0W5$(qPtTrfM%Htnz%_EJonun7tNnGYskeLnGX9`_v^Kz?|&yVA!^Zc^ROoC#VptbgQm>&O__JRw(#wb zZodAy3%Fr$e=9)`nZtQSpZ>aumQfE~J1ssk^gJ)gX^JeZs(WVf(){x3QN7cWVk2`; zRUODK9GI5;+Ff&LnrbI6|B?F-_;Tn)2e33H?%DoqHL26wtJ^ znv3O}3?rv#m7H$g&LQN*Rrvmw$EtxQE`057&S3@LaRW+EHHnpN)(pYYe6l zk9_#@qK|L5Ztex#a+h;IO4Fsm#HFhM_a3kC4}SBkc6wFOL$jgLB9)Tbc`X0wPnV&m z%-av#m=YJsa>769AFaQ1%iedsUo*US%G2|&(rY#7=DF|nFG_0b7tXtCWS_M2r~cl` z1+;Ts5KwUl=?G4r-sUy~XB`*z*MlOY?+|F^KOL*LCc?XA4BQWNaB57Jk{ZJ&8xL$b zSy*SbD<}$G`J!h|fdHi}CRo=mF?wvTq-#cGl4K{GJ?O%^f5n09=ay|Mt22dYl?ma& zuU-T=Y>x9MOl2hdMWBc7PVF=;b-6LPG-JMsu>hYU`1Js{wKJdlnLw8Ra zHfLm?O(zQ9`E3{aEeIT&IVAo5DZ><=@oHZAdF$cQ%8afN~c!3 z8E(n8L*MNGWzOay>Gc>fcl@ix_RB7K{mX4>lK)YmDDs2eR$D#56wB-{R!dv z&lk+Kc;3m=tZPM6j}zT zmB^ZBXWo)u3Bu`g+gvV&<1tT`kFGzBLcxnDoE6d64OaV!!m>rnen+wJMuiHOZx991r zFBzH=cf7Rr$?w-cJY(ctlZK&(&*T@rNR1Bv;MU9ET)E@REr*|;eaY3sGB=+ny65wi z^TrOGG9azA!LVS-n%gH1xqI?3mz$mZ(icaHYi=CZZ}za>hl?wp`EkSIJFl8GG<`=- z$yKj^`^@Z1CuDZto>TJF4;!Dq=F<6>4*I9^TdmgFbFb6Z=PkH;0{Vd~8_YMn^V6?S z-aMjL!r!-QV_wPE9*HGJp{u&&r@0XT1 z`;wu(dnCvxGP|^D&h1ZQVfvr|4LU}X#frIEjOTa%@RpqVn-};EYlVnTJZhGfhf)GInfn;a41?~=Ba3tr#~oG6+lxqtk;@dE=C z@*j5{3l7kX&rJTOjNR>aKmYvm?|=LWj}ek_1m13J-2Td)LsH`Y-!gra)gkR2-8hGGOW*lnb$&%{-|lg}65~+Jd!nTL zWYMXJ;K2K@nKHU>FN4Lpa>tbOgkYVz z!Du;JP^wla@4aTy>Yay+Dy#dYB&rpPMziH`eyLh6fBl~8`=%!Fynx*Q!WS!k-nRd; zA$?!D>*}-Je<-*3;dj649vAh@ysIKZgTVbjXQi;FabLly1BDgavPyF*>lHFe;b~{% zWTdH6`X@yXOpYCumN+IOAyCy}dbj_H3-11_Ui(fLp!n>LS?JHp_nv(8>$N*xyggd4 z^M3m1H|q`-RxW>JZneQO>d8;iVnZ2@|Lo3Lz2l`SfAkC)^yvGsp@DJ1fh^A>@8(w3 
zXU0W-c-!Uq)lCx@F8=lL8%KC%Z9Z63Ip>`p{-3?`0F0vA`uOhDo$b5nJ%JD)Kg%bw%{>P11MOe*Rt=8=ge6f1V=f7+okeU9{ z-*1k#noN4_)`P_h7JW6Q$xdkxJ9XZ*szXa(o&r|k#c{dN_w10t>Fg3_wX z`*weF&UJw3yfUotdkVAvxkcXuSw6yQx_{clUg^mMhPi#gcbBf*cIc>4PsP~GPt3VK zAGJ#GPYckpIieHvYW-=;r7C zMko<~pkul2)h{HCy!?qd3)k#<^Ox<{5ALySPr=v!xUpa7L=7y}=pXRdJ74`{Mx;s4 zixP&Rk|0|RRC2WSonLl*uzL5V=kJQOnZTOg?#y}Wi`D;oWafa*NwUI>vU=B-TMsVY zxc|yt>95V1WH+9xHC94l9WT9o_w{qe4`KbG zwmw}aOIGp^YX5M0BuNn~wjY}N!HRv^xoN4%7qZJsh5dkk|NL<2&iuk##*e)9nvpaE zL)&yDQdZ;m#~Yuq0{`mYZ|)xBYB*mQt;TVE?gTc)Q4u7;_y)O_#P;}DpKKI=) zQAd&Bazqd31Hybx@CV9>yxjIvicIC-2GkzmB$9-hb?l z)VN4qfL{vmd+Da#FE3s;q-U2m9+-n7sF6ZD0Q+m)zANT0N{lc8!z;o>MVpO!l1PiT zQ6!cUZMRYs=j3>Wbz596OhG*_DP^8-*>i_6}KAd;` zxZYjB`V|h({IAx`y1d`mK3(R%`<>Gt`u5?Oq!Pye=w~Z;94lM@&pQgL8eU(q^_gj- zGm~O8%Jjlzd-K14bXJ$Ra~+N;Dk>T|di=L{j~UrJ19Ydj%{=!0H>5LFpVr}P)v)HT3M19T7zhRZ=2x7 zq}MIqe(3f^Kkqw`qq6^;1ej1#T@S4L*hpK1N#EcN@&eB?e67p#+LtT*LHf~~uN^lu z3*ht3uYX#yZs(YRy`H}HI{k?t0A&j(@b$q@fsh2YL&B->=fr*-MG7lw-}!FU?!4pO zQsZMIY};~79{wdY*tL_AdnN4{cfaJfhJy$K+=p zEqVIGukB{j;sy6;PlPK17Ovf!|L8lPcT0)?Mwyg->Uvl zMYWDi`6cUfju+NCj@LRtuLW$#a3n0vu$|+hZyVMBvdpxu2~p7&V_P5W+0^3fpZV4| zlLvL5F|7CS?x~&tJ>i8<>}JF5qxyM6?E7o>WF|&`de03giuSnwjY$K$|7+I75X+B# zc2VaT`@(rQ7)avoh0AsxE5Co@aEo5Ic=P@M&42aajfoM~Zuh-*{or0N-vaB?bAmA9 z=@0s*BrN><^~H6LYhGU5Eg^c^aL^TtAFkfhD<$^dw_atW&NZ$5@#Dv@96Nqus;OIA z94E*Y1Nr(7+s5_l`r$)2HM9TiyC%;WJJ1)Pe;f7>`xH?lD&3r<4c2)up7orHv`XYmUa=P9V zbiJ?S%W{KTzC?8<*uS6Ycfplu)R8N8<=npTm;Ko)`_EYh9airS?9MB!XmA!))-eq0 z^!Y*KJ3T&vz#o};?U=zmML~XP@v>!`_uM{t)LoOuo@E^V9N7XsH(Ejts`&(M@3NzvYej4 zuk6)1BR*10I0oF#Y|-3M+wJNsn-Z&^T0ky@i&BWq>OC%GG8 zXN=*9EnbTelB-H)LK=)fY_SMwket<^r$MZ-vOr6IH)@Ro2P}CTWZJ7madIjwn=O@t7?>$`n&NnNItLworWpzr9wwvQ3Z4Aq8I8Y#p;-fcD z4zvH|&z60+VfTXDr`|MnaDYDLcu*yUZ#aYRIkF$PnBQ;O_u5x0lA|M^zV*7^8OgjL zl~mWivE=7{M~X-F@A=~2W-Q;j@69DY_w1bV)$@0`e7|i0mHlD)g7oHs(gvU3O4Vk`!eK^JE+>q#Od+7f9rM3})U*q`~Ts@k`L4BOxf zF5g@5)4n4+j+BGz2Lm=Bf(W9-3PMtZB{j-6I3sz=;LO2YE~xWuakY4?AF!YOG>c=H 
z{zQ%)&;m3ofLjV!(&1;)k|+f@Uayo_2(RM~(mEWaa2(u%f&_Cc%96%F5twpd$`Y_X zilLaY3pqeD(9Ak9LdS9f!wcY+HR^N-a^7Qh4Djk$HvHaVv5XowzOzvO(UZ5Gh5Zs> zErJl;hU4k|wUkL=e~>NQ%OBpNK}l-xHF$DQ`rsNiXFb(#0^Vzs+*kn|bRw}kIb;fe zqRcUo&@HVnrzm{kp=v|1e!^!SLwK^l@H_LE^JGuNV@F4+dG+uqL7SJN-M9k3| z*c)lII=vDVrV*Sq^vc^uL1sG z^MpNMx;njn@Qos@CR~{ce=h7d>9OZO{$bgcy;lz(@Y1{+!rTS7$>Gw#Q*#5L;}gWx+X4HhoWgzy zpkE-n57+e_CwUs!!rdASBfE`2^ybj!FU6_?G^vMv=u|1_GXwzdbx;E) z5M77LlE|0mNdY&WmW3K4*kgN`o!{Zzhf{+mOMVx7beBc~Zv7Z7NzuE06iW}F2^ogb zbDi$FH7$AR{>rMYKC|Ap3!jcy{(bv*`Qf+YgK#)a%8& zW`x7^z;XkHA3unJxhrapssxQ`T8#^fLrMqV| z=MggKFwmzx`ra2ANx%C2`w9U2|L69HHop4|u#MU&`_ErA%QyN0z=^4K`x&0EaQbML zt8@FoUtmB0Tfs+jEYEX-6l6G>=NMM>(KI+;2*iC;en%vnJyFy2vsSsnp%o!FRb^}* zcL47|pAu;^B?6;>AY!a0;Mt@^T18!Ku|bJ{!tQBRc)dJ z#nnF_IJ)n6MPZ!-7z7jn!%D#o2V7Xt8D^!%-!iiIA<%8;29C!fQ<#FS?4ISiA=i_c!-d8wde4utvl4V3M8-hf@TrosF1Y#wnGNEd z`szpC1WqDXe2}j?BD1tcU?i5-3M}Z0jXIV6=Oe)Wtp|_1xcK`Bi|O+P_o5gId_WzE z(G35^>TOHbZ6{Fdk(;jt?B@j;c;pYfwU`(9f6kpYacCCHhXd%eKkvwS{JpQbCnvmm z|ExH>6<{*^SjjUVeGAC_@B3zVNr`3HGaNkN>j%p=eDd?Agh=aSv!;y5N)N9EZcmF| z^Wx$)U$5B)@Z6_s>idt}MziASd4VpR{Um`d+r0PnFMrBNiGT5LH>Jc{c>%1ZscsCs z^}k;>?$6Kcl=#oNQvv${*Yb-ip8I%dWutTbI}5BPiiHmHUxEL&o#O`_0*5~ogT+!58EN0hkJ&ARS;hUJ&a2L5DX;^SnDiga1L}AkdT7hE0NS$ zGcXo#)iP*H@q)DDXzAvHlI)V|ZHLRiRiOu`1VwpK1cqZmgf%NAZe;h=%QMsZrzigP zho`dtbX_@SoYhzM?|IXF;C7azu?s$a=dNq#j5&$@Jcrm~4HNH%&-f%>MlcxQ$1(tQ z^uaa!@qJ(wGHW!MH3~3~-TyORv`>Q&h|xlgz8%$@0_*=P)t3R->H1G3x((%y?`IEh z)sMX$(OWbE+!`ErWEe@}*<*Y7gRAso@4%CKaD{uB{VQY?*J@DRfQdxs{#?mHX7>-^ z+<{I)*YirO%OF%|06nizbDYUuC3_pR1f`DmpSJ-2N^9yLdh2svh@L)r$gByMMcb*( z*~fp{eqejfQAw0R&-?I9$bOC&L3jWD_iL7{*#TI1=e47T_R1hIZ1tYpFIR7Mdi?iH z9Y1sY2tkwp`<*`jJ^%XH;qgx&Gi=s`k#-9eq<=d}53Cjl(X*d_x54G@6dyC?@jP=J7-PA+g0{{Q#e;>Ya06dY>1>EBa=R8`>K< znmNbIw;Vd`_W1$Bx}_w9*$-}oihAdZpDfKQuDEN;*daaBBQ53#tLZPme|w(aAD;XT zEc775hd54r)pUQFBL*{DxKF4pBDxHu`c2T1aGobqu#L@K4{lv7zKj0yTO^(dt@&7TRcZ`&{be(- z=v4a98lk$-FnX>A#Q@7C8g%ano|vg2DWNo1s4p{2ybHC(vj^95M|R54Q9!BTQ}KkZ 
z^r5w2?}#-Dp)BA`fW81r=z@11h?;C%aW=E>C*V!U2x25v_MfMKm;dGJ9iRWQ33Suo zHigr`wVD{46|nq+Woyf88=t=IhVetP7)}7B^!fsyuimm|cP_(o;gMiT6io*Gn1NX{ z#|@9PnZpAC;D-Nd?T(L^Z)7+a4g@nzmk$Q-|NGNMu;0gTnS6Qw?l3^@q+bUc1SZ1M?| zY&25jh41k!Y9;oAZX8^kvIw}@iU$uad;-I9yoKXugTbKF<&t5Si-u@+$bi0Y&KWyr z{E(BL`SS>6qQ=Zc^j7xJT0m|ru^aI6rGPiM;lEf!l5W@y8lGixH<7(ZX=Rb#{*6#u ztm`*{=y?VBeYQATD9*-{GV$)iK;s8~`i~Yx^do1$A%p$D2sK4WTpHD9q7?Gc8$Z>M zdc&C8B}t$+eI~l=!QHDx2x{bXJhl^Ce2_2Ls?q87!>5DWo8GY$;Fjz=j!5pomE|zI ze^lB3C#6+V`)$i9+zlW^Z1esDM@wmrlO^%FJ7<6{_4IEIcuceW z_MD@+$IBZ%UP;#4OjPHD*kM`S(vxf~tND7(_Ps|+>}JEvaU+srqndXSKA7VbHA~j* z2r=AqcTNL6cX<2ZWnLYti%JFHGG(?Avl`dYjH6=P?-n8-IGodUr_~JGghf%e`>fy0}RD^s&Pt%tioyuqJ@Qy8Zb(4jvXIaq{JZ`*-Vf z_RNDVq7K0R^Oc`wfN|goFb!URh!ZOuF3`i4HvoRX(FAZs`DtKuiL4+6Xcm}af+Pde zL6XB2CYvG@5$#E_2ufcjofLuq!x2urk(G-oG0GZcHkc?~jMb>evE)dY94Xdjj+k>&*=hlzD4WN$Mx12w+M>ljb*=;yi zUC3`1z5l-O??|@^tL+jWh+px|93?rtC zephJYCsHT?ll5!0x_%Rh^!{Kqp|&Wr`3tiDBqC!FwE1~ni{nzzOYdGL)DJ-l#+!vkCgrm7ZB1=R#- z@%O$m=$b)SfSpME0M-I~3zz-XDvu?=Mnp-&u+quk4+~(PBH@Mln-PKVasY!6Bb>Zj z_{5mhf~}pr9L_Gme?3%Q1Yci-nWDiKBn{2|I<60nhg7~4JnN@6OY7+pWdGR(fZ_Ck z6UYS_(di4gLtuea@9}|a#^DbEyTKi#MM(yBgFDE8&m6u`fMIx1vEGyb#pcWh%0mK8 zcn+j^A|_GEigY3)f+Q>Hs^El1yHRhVNF9#Z4LV>VfcG|FDiXLdkwItF!Kd5NCPS13 zPTB%X!ZWp0HOmvEH|Le?E2-L;Ta;T?Bgz_z#KBnrW*ox{y;I_Pr^b!UOdFh@)IBM> zWp~|5>~CGus07Jb$5$Q(>u3!&yvqO$6W}Y4P$Q-zws`Kq zFM!QxRH7sblCxH>*?m@(0 z!{WMVbw3;rN;tj58KVuL5>@CDt$P?gi;XM|dphZQ9_hgvZL03c-0;er2@K!oPR zVJ8wHXqq!3z*k8I<{+g+8-O(pI^00ONOxdO0xJQ$nkh0Z#%`pD2%|p6YBK4F#7HYJ z8f@pBiM`_qMt%h8ueKcgZdYDuqnpCRQv+mPl=LJKXEXLniM_gi=G0+5$ref!(Ms%Z z^muQad*7~{ymjAw5Cum(Ti6)@rsk?Gd+xn|!Gb3re*V#WTXy3oE6o14=Zu~`euO`G zYCeD90LgHa9pYFqoHZI$qm`hGhQPc!;EsVS2%;>BSVX*j{5*}wg|>VRc&`aV5e}Z6 z~C?Mm|>uS84cibOgLl!tbuxOz(;dcPA|jr<@KH*%hq@T)ebMrKPJG(PQb8G zo(D*b0B0w#(8iFPvmS(>8=V|e2!B-KS`g*&5)46*;VYtqHUh8LZqz47SqwTN&T6Iz zEH&C@HR>!nU22RStOnB_AaIxsSLRIM1OfvhJCJCO%Pp&0wmW~#p~41lAfV*B(IUX{ zla+)w$Q3=)ZW__|vdmN+fm0VS9MF2~pML9od-HQ@4t-=X>R>vW78bQ~^S((_o}T~o 
zBQHF9U#px0MNNMHZR6%#HG&I-PK|OoLE`Y2Lr(e;(VWQVuZ2GfEb!gY5i@1KQ{1~$ zayM!aT#3;-DgAy^+M^TrLr(grY5Ql|FVeh3{z`0aq2+2i_pl56p$&@Pt@C$Z+6QdAcAmoZt)5EHAL41ndbV{gJjQ z4T7fmO;cP#IC)}oy5*B|Applniy$9g^zhM4?)@47qc zIBe-QEt5pV9!JD?69Z16`WQ^k57UO8VLQKWN&fBf-!9agRxPOc&`w!RCUER%MeTt~ zU*3_!Dak7PRbBj2+28)NmL8}f=bA1>Z=ysDtQ9>ZXP zcJQsCImHY8IwcJ1k~pw)(%9Zz;_c?MZ%UG+L#1_{<7^f^^;faK#r=L#VgEnpFL=0B z*$*zSS6_WKC%-^*J~I}=={f++ehr?20Wv%sD8pblr*vYl?3CYvb8t{#=ZeaXPg<@; zQLQqe>JMuJ{@&M+GP`ws@PP-6#tX_KrLtdDyO7HMi(PO^0x!TL>S6lTdv=z&&N41=Z)M3PUbzo0qM z)lwt@9}M6qfx(Fh@F@X;7%2irQHmgpI-*-*6oH|@P>3`cZAL07(hO_}i=n-QAc%rk zH2#c5kzU+^B-p>#=LRo zWGrVb8}Mmb>6w3CPQ`)$L|Dm|WFetSCKl_3+a_ zdV0zF^jQ0CSN8kux02=i7q7gmd&;ySJx^Zl4zYjxaM_!4uSu}BaZN~-{i@o9RQ6vA zI+5>Kg5hFXSX;pxWF5X>qc7kJ&^0a}!wVG+?jXyRIXrGZ4SFMv2XvF+(*s%JB~gGW zGBm6pYEODBdwLcG(D9(*`U^);Htl@Y~Yq0;`U3vL?cKu+r7zDnB z*>5x$eqOWXjvMb;F#nO~+a>$|{?YO*U4=u_QQM^il@g>=rKO~sNtb|>l#+t<=w_n? zC8fKO?rxBj?rxA~j2tnxeS6>U{SSWe+#l-0{`Zl}#S4ztLA>fB-g2*#%Wr!h z@Ly|`*tpMTK5es9d_k3bg`FTnif|6ZOtyT=z9soO8YXWs;G8z(OdhuX0;jfgs-A2j zI5&J)!`61tOgEld`3Q|<&&Jd?m1y9LA#7e$9fPf$ zfOclkl)iC&A1k5gT<$A$;@HzA$d$ zvD8j88HpSCz^7?zp{J zMQJTpm;LUqq2pBTzz{h3-gjtY@&adisKx``td4Rs{63pq7t(REmTA{;JEIe3L zw6)fji31p*sCsIG#EC-q`sZ#gwmCmU`*Y4@he^ZQ+jqYciLmf`BRiOFIj}auSDxs43w}XzQC#H8?=jROx+?9bS(h1u)^=^qQ2U!NoX1&JhYJ2(*WI`j8&~+^sg3Tr=?*{DyH1m*)9A`ZjAg#R7o9IpF?EodcHHIG*Wq*%xX%c%J(3L z_0h=1gD+z538Olm%?3FErVd8ft8r558xAU*GitKjBRc zU2tYRY)b@~t{zk)gfaGnP9dWJp-E}L-9M^HSOWm9lY84$*V8*2*E60MA!2U<*GHOy zL*Vn~6HS!ONV(}Xl&sloC3P0i=78)y>Z?D?621QSy@&1;84HF6+-v~G)}~hMEMZ;@ zJ|4q)ay0oKYQ{}|e?IXOc0d2~kvqZ%<~V(qnZkp#^?khND5vb+To}qz)l*aZG{VzU z>nqmpDj5X`R3R~tIiLAZ*|)ld?TKN$cYbGBHH}}-D-j@*0w`3F=`JT+{Dy^i8_{z* zBpUWMhCt@QBW z#$ET4xbl`s7tYuJMPlJNsDfzYSxq{4R`;pEX*yl1*7Ir}(e8tVwv5gFIO9MIw7&T; zWabhtoAY2+;_#L|-3M=ozmtlCtCbxz%Bj?a=A{Nb2;@zd>3U%N%&!PE z8207rBftZNm3#VjzDBp0xHc3^IPl0aPs&fdXXMEiGno^h?@0c>$a>zBE7;mFv&+7# z%)tZ_B6fe$1Qbkne6!hGUyA`>1TI}~MXj!^(sfgIt)EX;FH&JNn`?O)K;HCU#D138 
z8Ay8xMR^bO&Px#DKG)y(bj_#AU-p7#x}F?jx@{uPRv|u$QbCqKnC(PmdiWfWbDYwv z?&mh@ok6dHp1g|C{~U3-GIR zyA%P@n!kI>Eipt~V_J)x(TuqZSm%~uzl$7><8Blbq z8Z%y9(hW<}r0w`~t|XHT&7y(NCNwJ=MrB+^ItAQH#O7e0F81`EoQ8I-Cj-Rbl{w9T z`+2)gel6_B2oZVIW`zo|sYG36SJns;mTwJX^hO{-S)?lu&bp=A^VU00Y8Un@J>erX z9Uctetf9rNXLQwWy28E)!GmII1MRe)Ni0xmt1{VHTis-RJ=@9oCU=0l^eI5MasOqz z7H=eX=A>!)BBH@+v(YWb26&3+dk^ zu_>&8d|f?#KYuk25K2No6kOa)|H+fopqBh%zxHR5VLhyt%9iPt9@~_;!6Qz-JW0A4 zvd`R$b_xd^78?S8-Gm-ujH>2Gv0FZlCp>u%sKmj}Ze3#BlIB!)Ys)ZgbxGvtWZA#t z0DPR$@Rg6tH6S(HCKFigA!_eQ&<5-39hf)+5fi-h+yC-62?eK*_s| zZPx$r1%^;Z*zL~f)>fdi+jsFeHdSF`m~Yx}(#HvA@5cx3^e9@J#I z;YN*_0&|lc*Sz{1V;d!4oR1H;n#MVBYbLhmSH&^e01w@(@Z78%#OBIk3b5em<3QT( zzPcz&Hh5umlWQ5oKI7nhwNMVES^f&r6JJX(G4XGhnU>%_tA`hVYtl+IHFyhO{SaP$ z94>t!eMt0?rIr01ZlC74!Ef(p-Y*N&7J=;#XkgVoB~_~Z`od z+ObD?)@$_5$+lA91=g*2Oy1!$pvg-fjgq;&1fh5$rV2%aB+)@T!T7Jm27cDsQQv3K zch@oxdp&V_c=!s2ikjHn@!84@$!4EjJtDL?M=rRNRu8(eT3aPNc?C1r^L`^h1v1UP z>s{UI8Jp7f_qu|0Cwdu{@f1{HZ7)C1Sm)5S|MDDkL@nJnC(4h-@@{82b z^rHWC&Sv!akbd!m%CP(o#*r3|lDP>Z z357tS_nkg7pJ1;;lrZfnZkfG>Zst0f75|`-x(Z1$!|=2yuH5zFKAF=vuVIF~< zeS$#3%FNMX$tlp7o~x1p`47)G{x15SDEvfpdziM$`#4Ms?750wt9A-}9UniM7=N61 zE-b5^re$|@zqg687Sj*JWdNLSBpvvyI1jvv=HJu6aB6@0loKl!Fzp1Pvxjbqz2;sa zI)!_;8)|Nm{TwogVpBS&FRd_b;{?OwW->3u0e!jWd_l?N-jx-d9ZGEauM&+-)NGmBaEm{Y404_y+T`N( z@{VV!aAfF5V=8Wzr#sk@U?lV*K#_jhzb}M!SF}t2Io*q6|2N+9|M`ENfk1UP$@7UI zJkyQCGS|D9MKQDrvicS$NS`p-EttJGe9N_^rjlQhm=lp}gya1$@PaJJoZW8Wq>V-1 zQ~ifVY^|6257n5ubY+7(%Av`35RjGB`(@aJ8)_6I33-a0{oMO4* z*&1n=`;axNzi?Q3g%cP_th{-M|N1GNI`Pna$ZZdMxJDQA_mK55U;Y(~i01V80r6@FHo0$9b8T#2{YiT1t0|$yJ{Lj%S@& z@jGortUo%u`7BhFEZ{Lo_iJ6y?mOpRD>+{O5cJMt(vHsw^u6NL{Ds*`U{j0H2LM=( z73i#pKG#JbC`B0K7&9qz?N^*9XnnBVm#t5`pFwSZD7T*c-1m`(pWygLD{|y84fkl0 zd6T$XyL{5APHf@1AX!I?<O+vB9OaU!TAuhK9Dn?YUa~5vaKEwIrp`i;_&@d$VJQ$GIjxqfcG*&SP@fI zDsDVbfR%HsoBxk)hw(7lqRpGNf@W!-Nryp3n?Jj#NaoQZ@=9==(OU$C*?~Wy_vPIaF(tL--BH3>{E_7@8hWVJUL> zlt5-n=MJA}mQ=oUY#Zs`7nFoYq~DH(PNr}cWGp)2(~W^Dm_ww 
znsU_VMP?!L*|7>fNDCEHd*h{}>c^Kg{kyb7K+@9XBki1Df1#h}9^f^_Mrzan|C@HG5l;uao_Do61qr zc~3Md-|EquaIxV~FUSdaoCW5XqZM{yEJ6x56PJpyvy`q+<}0}pY;`tzL_sZuqQh0o^sO1;E|ccz2OL=XJFq8G zDxT5J#E0mpW`pXuG+VIKE`12X$Exube5q5g7@Jp&qY@Z)8E&BeW8p1{XFhwo-0SFy zG=BE0R_a~f-~HKEyyh?fYAFln^Z`t#0~vLO#YO zrjl!}rIT7Yp7d$`E|XHFj=Ej+^(O+CRQ&k@xomyRGirt^dxVJm#R3yiG=Y@{VO_!Cl!NjqU%f#bvivC~U4DZV;@t`Kx*rHJ82;!4u zN#Ns)_W`@MFAwdHX*cig++>UKWA|Q&f?8_4k0>`g&->}GpnA)jg~HY-E0Kxu34rPj zcsgLo>4^jn0dc%=mNHWmN?SNhmOub;?2^-olr?7jQP!>e<`?sH=F^93T+`f|1)r8W zC$JWIkAL*)kZW)WbK5~uFl;=l$u;&bPQ(t?U1XeSKHCq`@=~450JDR_vHlYFa*he> z;3qnb>6?;LNPM!~)&K=^QwOnz*SFq*+oCe>Ul_C|F*~;h9Ve3^mKswCCzQjiOpKk* zTl?x!2)*{~d+T*_DTz(tHq_qwEP?oCR5@gFoKf^^@Sp<@H<4$qLIbJB->#lW%{9bz zr%3rJae4IerhRY2;~Ey>GL|q9?}f8zBCI=fn3ycTI)mSKxmRT6K~%Rf0;4-<(F96} z@n)gTtSs|wc=g2Yrk;Vx##XYt6;WqAIUAh^vC<)nn50Im9mz|DRD!mrOjgr6TS82) z%iKQ5qv@S*+EMdUFgFW1F0YCr7afy}+xAOzTZ}c6Kt;nd^U_i1S~)69k{z8lzP@|9 zkH5KibbNlg2`Zd>X~ud3EmD}k1#W4=7=@ksZx+`jPgs+!l;cHP0DjY>DA2acJQStp zIEIu?a2gwyh(c3B&uyuSDvdtrwZBj>Cu0dcq^-(oxei%eeX!AOjKVmw|6GF=5Y4Y) z3b~$d3m9mO@im<~!gt9pnXAXF39qgr{ec&Lpb$sNGNr(#WTcbt$Y(8AF`r0d8QGYM zDX&xD_{`I--Ph!hC^Y$O1n^P2u`;)!WkY3)Dk#6=RUooKry@Z9R^#P^b=2_j2a?Le z19x}W=)qkSP91paFfCT{4m=4UB^AQey}D|1ymCrKJX#)mqpIyFPZxjeM7vqX(gU*f zF2JP``6a(MLG_nIsB z>X5vd!OVn|g~9)oTAVdgpm?Cw=G-06MVuCeT;j5s$Z<5fELf^nLSPKl25p?c#oCdz zIwP(4Uh?Mc2C3*z)CmmYHEPEac)%hUAMs5v;)XTP;r zRwii1^5hfH+Z(3hf?O#|%c}I|H`1klH_`nQ!3F=P#=8L;x;87RT=}F&J6gEdaS4Wk zuNNd}$k6L*;>iOj=D-J&Xb!uLvD}lc+Jj1rGxEjzEH-&Oo2rIChMy1KmQ?WCUz5#? 
z2jsmV&6bOOH9{rTagro|OM|IsWIIM?F7)ve)0iu8`lO9jzDRy~$1&N2V_GGS(lp@M zs2Nb5@3|LbIeJT1<=jfh{EUnITpd#m4X3~T9@IFT^*F9EaddU497ld>r=*GY%|mxL zUj={JFF?Cb)^;8bxF3Up!ji3>6=&9W@l4mN;YVpy6WpPW}+E~uht6=dJ@|iB^p9LPs`gooARHk z4H>%*`-M;KwG*j-)m3({<>-5=+`yQu%Ijhw+!aqc`Ux$Le6cdly zt@dVAjv&9ZinaPttW+kxYT$xfVgEL*wcg~OJ($cv{l$;n&_95oT(^`1w|fiN%rR%5 z%S}rSjeys@L^JjI{<}Lo?(x+zsFR(;fiZN4+abh1=$ zxtoo**aC$YC#+gvYmSCe)7+*xQhN`qu+eyV{GFo|(w0Z*uX03dSY7pcNdidE5u zl+$4@di0O~>(-97i|b)wj{ zwl_^@5Oam^OhvJSV(0pM7RB^k$U;S8s?3-83f$vf&gF)0nE0g?)1B2N$L$cYQDQn4 zK7$n{Y)jfFX4e>z-()wVXv1&LNEw$Glzed;h2f*O;1}}`r-0G2$OT9zv|z9HQ{UgS zET^`L$wK7%M9WHReK$=FD&s(4W|uk)g|G2(G-8HCDOP{oy-%mp_h#)b`uMiIP4ssi zN*3pA#aFYYWkCx96Y>oixY6kGP64=gOz1(bYl5j_2Kx8}C0jML!w#P8O-q%{FGYtL zwDXDe(tLAFmtwRR!i*JWnXwl6!hgeT<+FSiOg}UtyiY;CATGmspHBEU)Q&n_|5QryU<7GdHv{G^&Drl*+T6Ok1W$UQQ(Pu$>6|4< z%*suauwav!xJx;W7a6E*D1|Pp+>TOv8Ej&AEi;oM*nto(UO?327sZqg@oCMs956hnTwl<*`yNVS|YYWkdG$Va!h{Ki<<~ z9YeXeMtMpx$A7$i9hLdYIe1Ql8?-W&{qn&B!oc{w8EPAzt`>;cy2P!f=wnx5+yBCEQ*Z;=}gvaKYC@dKb4(z^n8Q+9GCmVIqCj!gSOEFyTx`wIf=~@L_RN{p7)H2}%dt|Mqk^cb| z?fL2MDJxu$6O>O0oS3 zlN8Xy9cI5_+6li>$7ATQzDup(A=%%|QpiW&f))Npb1}v87x}ihc!RdY)%%M~TgQgp zF43Hy|G>=|P0A!X5{1R_-S_JspH{D@Ot504z2nBoNPx}EOXFFVke+DG$fkkSaV_hR zO9E7|U5}R;`Mj9OKXRzpr!L_G?+}q(HuGSaxmUhw%v*Z*&`5(~`&)oJhlyKVWCYvT!q`k1Hrmc2^ z_8P|)Fy0m-cz#DsWOBbPe*NQ;PWn+9fS!4)9Iv9a(68*4zs}!a>}sSNs-S5=kZ4#P zDNOs~Up#Fdz|@hO9pS!<%&be__24m+F<0fKu<$W6Q>UM?96aCXoZU%yQ|TCxN?8_} zfO!8;tPWLP8GeSi!!ulP?RX!zw8b;*6;107>G>cZ`8R&9 z_MRxdh_s_0vZEnRT1b{yYt#TZ3!3js818tJSht}(ycraa3Tg%=DH!tDTyT1%#a5CT zETK8sOB{nlJ&fja_CiK<^Z8&Oj)zFycrHEh4pX4iHDqGVsF}C_ISq>d2?mbC2Kz}v zUS}0;X4(te0SVQFCSf?|Z8xR)Y$F!Dl$^GX&GdHi9h^96Zo;^sVi3YstK$l{c|kbz zTsm29s}s9kWoAn%IA-Gb@iBV$!BZT%CjB3!fR|w2)8Ic5CFw<64y2ilT#e9vJxV6H zVa)AJW*_5I{?9%I#T|bL&2(nA)&eR>xe~YOe980&kK$emooo^KvK*U)Tl&V=91a__ zFIO-XR1QwmlAw#^%kGGWnlGnnSyo-;xQg)x$)Z*%J#ZGp2Blln zX0GD`6g}0@t+j&>S_JNOuupb|V16sh`;IxeKrG+D6`SH z8;^R4)we+R_VH^U2*b%t@+uy!d5VT)x%B-q32A4=-fi(wM|+IkBq8Y4abm>QHXh^4 
zXu|PdF}B3ldY%NHWkrQ0ciS3&gybN$EUIpC={DXsuIf)8I-WlS*=z4!Q_f3#M*f;* z)Q<1^v+aW=mn3$LzO?t-o9-OQjjT^Yci7(T92?deKZA7+&__ph5B6{Ne=`||i!>{q zE$#=k5m2->q}&Ok25*j%CG8~+lL`SeTR0Cmh>j_ku{cFnTIKJBuc%H5x2r{n0Z`Cj zAo)y@0M`N$rjBYkYI<_TDDB6cgK`A8AB2zMXw;><_n{T2trk6ZpfAd&&2GH^Uw=7A zEsgK|S_u-e8t-N+7s$_9MXV^fkUnl61@(U*i$C%ywh1KQ_z3 zu#@$LeCtN->eb$6fr=V7jtKcM!>({_^k4M{$B+hDUX}1mqwIU>p6!pxG%BMzLyo}> z2_b1IGzImXqQ3UfgEgX#MtSR^`6lJ$Bl@nZYdNl6y7|!ADj*B*`g95lJ52wVxd3T? z5%D<9s)Q7$5JLmln9=s642=wSKmJ+CqK)#+U~h8ftrRh%6>$nM8?1POkIaqPlmGz1 zm*w*bxbE#YBbb4Rh0V0X>i_OnsD_XQnd%UuQ zqsmR<@wH*Rp^TZ2%*@~Tr)tyT^=oprLZT67MpV(&eJ=DtEb^)FLe{VSO77}E(8l2% z2W$GE9jdU>TJOpNjK2M(d2kntp9f5fx9?I_k?C1X@z$pT9+nl@^^HDEBeI#6x2XM2 zc5da=wAn3W*oJ(7;)d~S8vhopu!E1;;3?oMDyG0{WL^3Q(k*TT#8N93 zmb=}<89liD40PAGBGm3iAntm^8Jo*ailh*YurQe2!#w1OsLPdi5W;VvVa9A@ARLYS z@hQ^s9nxr3aE&N3GrNO#FE2O@8?>XMjw8+Dg5$l9&JR`O$;gCh!Gbj^{R=H2x>*8T!>83xnx&sEqwpplC$;{A08+%^gyIz~ zD`pq;b93Tn(XnV7Zthxt*4OW z(e)zwwUIN1)J%NPGdgM9a*_<#T)U3~dF*7HC#;YJ`BBI6*kg$a>fb5^|ND;<7wLAE zxW5Nct?UPFv0yBvNUSAAo9G8tx%x&NfAP>TO+w$L;)n0P-gcHueW$b_n3Nw*ku+Gs z^k(eACi3+kYxSQ^@~H&2e6Tro+P9%YWKI<=NZHOKP~WONjGe8k5kl*&sy%! 
z>L)KkFhv*uH64WD#B#J3to3ll3LrX}JGAN7a@P&0n&hmMZ~Sp7uP?0KRvvbnA!N>M zFoZLd&_Kb|0aldGh)KMmke*Ff3Zt#nZ)#2fD9=lnt6 zo{86)Hhlx!4;C-R(1Lur86uObt4lnlUei=i)7qv?bF5j#O^0id(o=r_+6MfR2-&1@bX`Qo0*W?c;r- z=j!ml!zk*(?9xzTx37yn8#&&uW#(T}BH>wVRKEX;%6Xxi*ymd*^mv=b zZLEAF$m_~QD^HED2AD%rXrFG^>98aX!|8Jz)?7ORoPxBLy_>5#IZJUL8M}-Ep{(-tf3!0bN!A=Vyn}znzJ1E=rU`P zzKb4oZ+o1CEmwiD)NSoGkpKl}#nC}8@Y)_HR*Fwg9qGYuPyCTuw{RK&6AgHY&1iNIX_ulcRc>n3yb99>wpJ1c@!Ya1#QKfE}`-n5%kjJ2OIEZzt*xgwNGVV*J$ zDg9{1wUtJDF&-T{k!eF2L9FEge zPUlm`g=gvMd%?}Wbg;n=zaE+kOQ_Ey2iM@wI)N^s9lQES8$jQ?c~Y@*6CLo|9)BLM z+uuvKx7j+SvBT_20MXop#r6W9z9}$t_)PI7vjo>6- zFDLZ=Jpkk0j)~ELa%V=LHM9L0;QK9mWM(hq|DKWI7TC>bTL4jt4e-eNH@it>+M3aD zvb%_o`qBvR(qZ;t=eOIfI^ULRt4bAHSu-Fb&*{2M37HV42nQJV@~3*z`UrA!l_myU zkLKAEYlDoW0aJ_EQb!-*`VJfR=62|~%YHL}NU3##xW_b5>h522f%F8-p5lXRQ=_)c zOymLuEHiqA&ayaAs3~RlwrU6T1Ma4%@|J%I}hPe)wGl zGVK;p%lj(GX4f~}e+#dv4}SxUfxK+kNpcYlLflL!UM-dxT|CeaRc)v-(9zU*BEOQy zAWlLoUWkMCQ4388n_;Fef8rdgbIezJ+#JBJp?(W4VujJQagAwz=U3J?*8UERtWAvl z{WC*`087}r-Gz1CtBrI193GKyXF^&V#pveEjU78*<6VnidNeR>%zOFY5`i?D!WgUt z=J{8vcb?u8uI41_UvHmXu?Tt0%?=sMSKmg6yT~YTKNqQ5&MWxz_CR}X=@$_+7CgK; zP=$=2Tn8DytS8a=q{!GAY%47-blz8rAJH52aZdHHP>)@z4QV-DzZ&_J~s-OKTLB!)V@aNQHI?f)>$l2slA+DK5R(>Fgn-O zUc0&Vg1WsL8EFfp-?Eogfc!?zZRPxGE7}mFBGATWLAW!lCGf%YZ=Qut-3HP+JSXES z-RGqP_c--9Ctf2M+G;}vytdb$%-S+@{PTJqVtUbM0`@Ad^25_d)tQ_r20^G)lxv%8;(SOUBA3v2TqpTCtsuaq2}MV%_AB3y3&3CE#LCLe7^Pm zmitK`f3by5pQR3IeI)<$QL=PP?b?bm4c2C1KG{x2vMutdiH+HbnJFg61tTT2=<{Tv zzi452kNH9uu7VY-F)HQ5rUGYR*J9@*-{|a|oy-2BhzDOzqh17Spx!{x>5T~)`cQ7L zUuJkcp*kl4;urE8vr1y}=z`MdkVPEItca}7G420ds-!_d&9lu}v*UmuTP1xZw5>fkzvv;2q5=5j>3PeMl~ zB*XEJFKZPnbhJG&>eWeKxw>lXBWAdoL37oT2;;g5>w2;UkgJZH3$6=Tgyys*Xn)pW zlY{Q3-vS#}?B4!sHOiQTPzsjR7l-%)JNJNeK&)CsR2pO>ioFN zSIBxEAm}$G)ze@-Mru1l0w5(e|ME|7xCK!CRae6*9*3f+_UL2;$1@C9o|K5+qd84!PJbmRN3N8b!qwCTeE&MHK3NK z)?~SokS{!uYn7sc8l|6Q>Z5AkQ_cKwn{El(;PKQRZ@Q%fvXwP=dbusN+VS~L*aJy? 
z#o4J?yGuHZ(E1z^m)gb|r;|mx6-}+LAoJxExaP(WwP!cChQa}fvGtBulG&PaQ^+O^ zJplh}0e<9mt6wSg_q{`v$n1?07K^mrftdzBxTl=BACdgD2+%Ej&z6k44q-#X`X18# zTMf?3_H{p5FBbnsgakRugIs}W%0f{86OK^Zn+bH~5$Y*3rgAL^Ob z@*=1GCe~9$%Brg~EEO85qUYBP{(W^c2RfEzu@u~l_}LOKI!Nj2uqjA6LG7y7gNc*6 z>J-sK0+!o{(;`_3sMjT4tWWJvfiP2KIhh$z6cqG18{?ZNA!(M9Q&HX0k8dl)oc4<` zA;9&zh!Lh8foj&T>l0&~6%|^HM+R3S6jzgX5MT1icv8d2Q9$R@prRZ@)3sJ+gKJt=Nq3jMb=B@-IbSz5|BTH=gV zY}>{o0$jiR&S{#130cI7zG}w6;9OCWm(fMP=+InF^vh;a0pYXflf0F=dl3)7OUDZC zOh{#z*9n>*W#ds5QLzT_n7~)#3YC1is)%;so?cD$Q4wlVK1@wT8g0&NxrR*eT7o{I z>s!R`X`Qw0$p-tPR*+n=Og$ z0Ie6w^4`%<)i$XO9FpN>4%3?^*mba47#KG<(9gMejU1HSh%}`vDW2TD?{?vQEq%Me zz^P+NzVH_^#E1N;Hf=e`W8C<#u4oH(-S6XYOVP%XhNIJVI8To2Ud}F5dXul1RzsQ? z#{s1Eu3c+hs#Use%d0j(GNaF0Itg*cp+#_@1_SbP%Y;4QjWl^id?#Wq>ECHbe4XkC znU`;Zn(G6)-=os{{M-xsXX@Wi*Uej#_5bW;1VxIwmY@J=+@01b= zC*MFeJ$H`v;TjKqZ4L&K^V5T*ji|Y3KSogF`gorYsu&p0thWiexl}jCyCy`OHrgJ3 zuXOB|EqxyX@PzrIvcCZQ0@?ycTm40j3QUQAvK%|=*m~ULbX`#|%#1i7c1qX9?`(z^ zD&E75!!u__4(CpP+*RNc&qS0DykSUr^DIsF=Y~uGU1hspyT%#y)avB;RG&lE^$IdI zwB=MH!;$;0p{(`*A&OcWfTKey4I3Xi6>S0K7A)Fg7w|k#s6P^^b5J~P@Hp&uwIzV^ zb!88Lw{+Rm!ElCx(u*3Bs2P3_Z*htYlQI zvJ4_rVLPGPv>kMdtJW<34AvUiwfq-#Q*lV<;h6zAS|jNikD>|n@7c)%CBu@6y8`(ECRHej`sRO5z| z%k{(Swd*1DykZtKWKZ}|CN0A+kQa_ZqcE9l4HMWWW;%BnAbOSEwMg)Fr_w~GDoPdc z`o@<`?ztf+GC9hT9>(1Vo{~G4EkE4=KEz#6lpL)iIyCNEzJ)%*-As3LYu=63xs`7J zme4>R>&;sp`+@z=rsSg4Pu$a)Lz+7F!Bd$_uE<8_3qA5SwPC!e>vJYukK>^3Llg&amss<4NP60t*}G^!W6fuGQ6LQy!xgjD#*TP9J*j? 
z<16&;_;X5!qq@QsdV{Cr#7gmLZf=Aim$7pg>GLF7V4Tc3-E&wxHvqi{qn{-7*y+=s`9dYU{GFNSs^o$pWw9nA zq^-TmJ@_~73?xfk;6NomC0e=IdhTEtirxcmYRK7A{0 z;;zItX2C4e9T9Ks`?p0bc-KI`(u~JcnU6{Ey`6}f{&X=E_~A8sxtUQt1-}e}iMV<2 z4_vJNB)Jth`vw3RXqSprvHYoV$ojDV)mFMxX$X^E3Tr*{lq<^gv90GS2!_g9AJUl2 zM7}NBz@h=W8d13OQq#8kwmYr2t&Pr-tZas@#M(}J_Pw)6lMR^gO0ovN$joJrpWJm- z2@azMzYK37(ZpGV2I94N4_k#u+(Jp32i}61J zx@FeB`KzA@RyHgCCb?_a&bPu#@tpeCNB$nOOiEB={jt*<1RJXLcI`dQ^&-oC}z{R*L0^g*u*J2Ev*Ql@-f zn#eKQ02Cja*;KO7xYEQ$v4Fvk7paKAWS<(=ez z565>UVnD1d&Z@jgDfr$b8L8!_ztN4vSlRpqTJ659C&b^^Ambzm=*Gn6+qXFBzPnGU z?n0Mc^iElN=-N*G?M!d}%ja+H`vIwoh`jut$jl;$j&aR495r*{xP)srPKAs(_L_|W(w4)%E8{66z6rQ z_&DA`S1QZ#&&B0)rxYMZ?Otq!;AltS?cd_a_Jq9fXGnq0>>((uvt{rr%37#e!odp%_JJbn8kbGf$4zvm<=0*K#L zh{yHzHVNM)LmJ=IEwQ6px~|(D*EgO9w9k$PHx^<^FJnzxrJT;5Bj_hl)cWPUpbfWo zB5bjJ2a8^+>{}=r3KM;8SL9zi8?aPo!=~3%%Ula{)Tq!k!ph)y7CW_DFzgk3=1$^z z>kqw@=IKW!?V}PGCQS##!qlz}OCmvno-T;}?Q1Uas)|o&c(f~8rJX~7C9;G6x$*;= zj-q;bdSYiey|eMJC+Dyou}re+C?DOK1@L|Q{BEdP*aME5IJ^h0xA?N~KkEGV5y0Cc z9=3+DqM5>-d+%aOm9b(6==p*_mN|o)4Ejfwl5aEZ>0Dd=%_f}~J6orwux*zOlE-;U zqlW)V(9wh62&eU`bUkdfzfHFa{an>vIq)|ZwkwB_Xj!86EzF%hgbSd8%l*dXWh-g{ zS3~lTvndveU3CugWnP3E$T+vnA8jB6X+t+=X9MG0eNMMYULmI)iy=BX!}!l@ogTM* z>V4Ijg3+Ib#n7UngOnG{!w1@eeGyjADLH4IHmU+7e4`GJcg-*-YZnx&A>D3(vxPWD zhp6xFmd&O;2j7*}vpWCn&U0osNuas~TqkI?L>eGoa`OVGX4&{@rUozh9iB9F57cDA z(b}~<*d}T-BWs|#4Bs$(cgh)%ULyert6`T6&p1W2e8Qm)+_ z-oBFd^e&8nV+voYvAeLtVyENxBCjcQYC2`EFGE~qi=5QwEToWlp>pLMsol2`UV>;Y zD6pvZhgd+D`Bybntdbypoz}K?#1LQ+Gyiv0H`UkIjNx*k6Ptk>Mdkf7CnpqMSDta4 z^WoxV1`EnPjzKvRK}9-j2}9o~1AKN?vc~*NKfl%~^P1AHeajOr`0}#-zNJH)U9e|< zVollRFb)ebqXS1nsN{ghf#0bH@uONt2WS<&in6`^COF^^ty>W_Oq!1G;6{LL^1N!Up=}fpwo(+uL@6R21*H>d0@9>~7C?$r z=}lSyQ9uzwZvvr6ZwjH8sHg}Cgf0n1dX01hLXv;6-QV+_|N7jVTQcVwWsUXDG2bzm zbCf{-FjJBGZHS@wlLsFvMn|_t!~hbK&da%f{WoW@#QY<@67p4rug?!BvOYQz-CpG4 zW>T<3aQqeTa*2SWtQenk&m@ETAIVK~y#0c!KR?9dJ7&)8l9z7QR0?jjZx%7|F+i&o z%dX{z4h=ii))=t%Eh3C8mlhunK7L*{< z+xDfuxVk+> 
z>Y^pm1<4-COngXJbz?+>rEfuGUoa`dWJz`<;3NnV_IjJ=r)rWzq4`o+nNwn`ftJaB`+zGAa4=j7+O) z_!@gno*vtTs{@xC{&FK-60nTGS)Utt_P`o^r$pn zot`P&xGc##R`jbQxaoX4{Gsx261JACO(m#3&c<}nR=1~fh6bI>}h&pn-7hSAR7^Q&!d^Y2Jtany0MaEg)% zk=Qm+UYe?W2Eg^o7tQmueYLjDlii=5*h*29EZbU|WN~JfFo%8-VpVgM}(S)iH|#b<}=_l`dbpjzb*&C^ImE!zIt7-gA$u;$;>i*2oovd>}xy`e8sDy z7NRHDYeN6xV10$z8Wa(BAWslW;(+wLIH1v~FxIUJBJHE{NI_`Zfo2pyEla0lJDhO2 zReP9zyKd831=G!c>(f)-Kcy$<$@9zqzG5`5bk=65(`gnYD~;c0*}8OPZh76_btUmx z+N=z&rZmCCN{p{bBsl#p_KeY5U$jWH?8*KX0~VnmSXp}3>b;q;$+v|b6h|Ctisc5W z%FH-NQt%E@8NLKiI>-k?ba3W!0I}F432q+j3#;;HjG-eC5ve^5C9NGU0{u^GCF1`2 zJderP)z>SKpP`6R;U99X^*th0@(4K@Zc!dlqL*3R^FqZrMNyNX+eNjGM1R)Squ(rB zJGFIxZ{ZHjsy^<9AJBwiwTSeXT4~Wcarm);*$T3r?v6b{Ho0h`V2xYbCu+YzQ!p6` zk$uz~H5vuhnAa~W=c`>oXN$#K3*D>{s9+aLMIWELY~rOcG(Bm5PaT;a?7tK69ykbS zEa&Hu5-~z?rHDjU;lJ^P&qlmgzl>$Ax6wE6%c!sA9S1Cr`IQV+S^td~JByCHd(xMc zy!i~X=-D$#>T*-Khzf64Wqu(qQyyD;u>RLq^aQ79wld0god%fzFYDJXZ_7PfD)@JWr zCp6S~Ww0%w^3%?nRT*NnLX7S}s$dE?n10`i+hnIX1y$IW&~L?~a?;Ww1EdOh#(=bb z*Y4u)C$`$fleJHdpDA6B0_)ZlcOBRX%el-wpAJ3##?GKJqupsx1_e1KCnFS5VM_~T zy%$9vRhFwYLT@WdocMb`fgxO!%Cd`_-v(iH-otG^FY6H4u{DK`d98)fIYmHnWPaTV z^Gi7`40X`N)Gi*-rs(`t%krE;wu26eklqWOe1{;nY1(Gyd+>?>ZKv2^Gp}dSLFVA; z<`_rN^|7TGLs~c?2qA%zwsIKvxZ8@Eq*B7z3AkR zbu?2{Ynd?L+gy`t6Xn%R z8pt1^pGg&fS8A)YX9uGLuHgFyx~cmyOj*o;7XM6+0N1A+q>pRpPU^(J?tR_YGayOOwEn?4R2?$ zij_S)5_$gY5oFnfKI$)YUEOVt)2xQ@>|mW4S0m>H#TG72AYLjbW?5fG@E?69EPT%9 z21gf2E?d}M`r4z)w(@HPIIJ@zm+D~Y29PI!uw)A8KNH8upUSi3IFEp`gx-vfNLe-O zb~BrjM(P9Vt~Nqd@NvVH$!;I}1XTX!T@V@c$J8#9)Xpb9KdVwk1P3TJ9 zPleJe&cY@`3{h7hI2+%0Vp+ceJl_v;#c+l@(4LCJ{K_NULo!VwQp=lKevM3xOr>uw zrkEIyJ#d1l{gjR^dk&AUs5hHob{G4-Hn-{0TljObCol8AX7Z@iWAX}akO${k0}Pp` zV^ln7ZxU!DggDc*5_HFkujaT}|C$Q)6%T1(F$eu4t|jt-M#1IyphQ7bAq-0T*7v*> z36@|ITJSPWeZXy!_3|(RtV=^JPf!2U&$nJ;?cdZoPz#!gIq#0G#sVhsOv?&WsUsO4K|hKp(nUb?S2Y~>FFc%$}Eyw2)`=C|d`$Hg>Q=|9F+@1r5=l=ITX zpG^C<&Tw_1OJK%jKW6AFJH96`mLF>=&YR3;Y2{m#H)pJEdc6CUe)0xoz@oS7qi18}31S>kIeQkW>xL(aY1$N#wIv#hlGavkA`=21?8>lQ>DzEjlaw&0Nea 
zpv%qAV0y843O@~cGe;Y(HYM$RvT7sUxe(}Uevm$EvfJzWz~wUoolYK6^@W$_A+;c2cYvx>58{3@%h1Fve`_hVO8<9MCGCW4W@tpmbR=jz9K+NVu>j60C&_KN10 zTbcXn+xN|MzxUuTdd(~0I(KSaUaPdo{McjJLCLhCUdABrD<|3qxqn=$37~L{`3rnkqoHFDSKa2*Y~bk(XfTyG~>*hpD@s)d;f+}5DvnQYH=r?KX3gv z?&JFBeK{-9gJN!_Kx4P1dE4{IM&F%x!-KyK!HdLZImV|(W{=$*lx}5lQfh6fT6kZH zRv*14t)yB&ueQb%_vf@v!7D`sUHG;+kq-o2qYfBCe1^n*?9Q9Z+%s|H&Fk;t4QV0V z%-W1T4wN;uFdBdGtN8IEhcVMC$2*sPTG8lZSGeW{yWQtmf}{MX;{FpTPv- z?f2}t+QfG1nGEsIKQ!+BRD~f1TL{sHKHxAi4o&s(>SIhJ#Bx=Ne!xTF{L%6Ae8#Td z7qy(3)uJq4Pd6>A&WLu{Hs6-OVDS^+sUMR`JK3KM#2kq)Nh(Nqi?M+vVSeL7Q7KoS zw}%O_ewV+#u5omlqdh48U^xYLzi1)5?Tz=6d%3nIV;~oG7oRsDKzGPaFt4mkV+z%w zVS1rRu5f_n`Ul3%^@xS@hU1&^Fhl@ltHb^geFeEA(>GZo5BSWE z_)5o+T0M?&8*+}SVSCiPXKPE_yeEFQ#`cs?=O{AxjZdmxOS@G{f=1_XT&tPhl%q$C|nrN;^3f#W2>p$zZ#>Vchl&1dbW0`^Chx8ScL zVbq@4KYe1f0+MHrgB<1PD>kQdV~Y>D7-qbV*O2PmH_ojzI81AlG1-|i@rzT@%LR%2 zinZ*bk;jpCW-H5aC3wA`#(r~1gbZ79Mu-2$l)+Hy`ZrTv0wJV>`L61zK%btiRh1yY zRSMAfK-85n&#bbLtWR!?H7FySa!sx2HGrYsi!gP)WuUU}-Pi~d7ml}Cfr(LOv8$y& zI8}l$ih-pLbxI^fQA!yJib9&54D0ipx31H?(ZUWlagEjgNZ=1(*?QM_?Gb$__QpTS z*tlhd3#9~)+-jcG(X!t5d^e1c8kjebnjao_KUY^TF4y9$X||Z0c;?8+bfKz-CELHg zUa@66PSB6oeHL<3gRWiboLD<*G$Jn~Idxc>S^WN*jwy10PVdZq!ZKSK`z6J-BCTKP zWJnZ*ZT5HEeqj)-$*fjnzw9pO_vO23Qy9TwEpGktr2b7bI%y(kXD)YFs#05(m3}Xz zEVd@0t4{pRU5>;IhU9MJ2lq*#lrT5SR|S6R(8)28poyOEs;wWdnIjgwzaJCS)%Tgr z740NZU8C-Zm|sP4t)EydTV!~7A02(a#jR*PGjmKSIap4G`B@!QWfzBXRYuBZJMI&P zI=kv;(=tG4rGG&UhZ~sp#5@g+DyhryXH+EY)MKvRCqPdo)>AP9U+CmV$Qd5Y*H z6i+x^psz#+H|vpV)E|iwU3;r@7MK@THUD<%h#vzN%nhgMUY2$Jti_jFkJ^+)@bhU2 zr@l8hGMvAMd7)yKbt!cJKsYQQ{kZB({llc)SG{5KA)hI(0~{iB#T;AqeO!yzD~ZjL7|}=bA1E;SFcPIt_Mq~GY0Biu5qXQnvdqTi+b*F zPVQM*8biEDn)W(!H>)~KRkX~m>f0w}Q{}0J|C`%VAcRc{&fL`}Io0Qe?~ZTP%v(r3 zrWSUxJr;TEEGJNx&yw=KXXZy(mP1ZyYoiq3B9RrLogZA6OX18Vu_khMi9c%|8;9*! 
z^K_)9cr7Bl&Ha^>%;;1kM41~%E&drYmB{9o=UcRS8p2eO)x8$FMB}W`J5-=^esL#ei#FSQriz5C!m>pH?PhJ3S?TdMg znGsvF+`YwS4Qt0MkX|R0jdacEZ&pUl+rB&o(H?zy>}UthM!rn$Ssaeuvy%3|X6~Ue zAOxqpB@G|bQ*K@0x$Z)FQ8k-bRZjwiWB!u>V;V@3YJb;+)N=MU&t%>WdO&f%y_l?3 zuz1AB&Up94$e4e+-oLD|P9Y102w3?(y(@Kp|EyKxqmZh-Za#f4*18?$`;$K(judQ1 ziuAp{0$xy;n6JV6kX!0AySH*9in*`1Tp4r$p<=@z9NWF`V?QEtQEjqczE0A{*^v6} zG+B1X#Ij^>E_4GIdy1Y3TreT_Z%7CGbw722g52o-ll+bn)O!V9O?q$44XBIlG45Gl zoU#%V_)Ts*4T=4nmD8R1(InJXQA2c$Eiucy>?8Ae;th-947JbZ*C@i6ROTQ7;L1i#3~hP}fqSK5f}Gi}|&>}nf*A~Q<;DuM>N_G|^(9lt6Ho9?R7<+j{){V^Kc4Rk&B zgD1pF+eOTC6O21-mfycQ>umVf{!`Z;4N0bjM%IgssFpbw$2(yg*VJ8o-Wzee$@ZIt z!PTlvyTyIf_kev2_WqwOg5|n1C9Wyu4hB5u(`?-?qZM4*al8mLX`$=H8E5ujod=MZ z81hVq;VlhbL|Bv(H;*mgh>JrQ5Q8;`W%Z zcoTZO_BpCIR*7cSPtr~BN zDj*hh9T4x8%g~LtAzT;4E29VEhZ_g)#IIkxT^x|0t(43QVBFlak5rrG#!}p6CmX>b zcbh!R4&p{Vn5;M)b)dYif=et+ebc_|0gu<&SlYbf_fSj`Rk3M!e2+-=R4L32+BK0r zEcyXE?-H*Jd!j3J)3`BgwO1l2;P#$p)}4vS*vh8bFO87l&!!&?A*H(9cJ;F!O@uKY{AZta@!h;Ioc;HuLQd^`Ba*XTsEBgtjeAKd3eCJN3@vI)D<&_kSPW#Jp~Q4 z%aG@=GQrzdV0E}cm>T?LUf!-b_mArC(B1@xyU&Xlk0bM=$J#vOjy^&lJr7a8lF5ES zH4_SWp#`19+_W?N01nZLa~Oz882 zgJm-Fu56Z8%0?^&2aA2X^%wnv=!3|wv;d(=5x;WPoiK2XMj+T^_VOHf%5sSq4oqLxe z2;_JM?)fj7vHfgqRfU zA6@$jf6{poO4%8e0=yUA0kEqNmH`GVGKz(fFs73qe%Pol58kx3U+AkC7mor+W6&Nz zAegi?pysxnGPOgOII(HlfoqP>R2c~LpcQtsbq%?lgox}ak=Ia7R#ahOm%MUeb!D5U zGmP{VINk_^HV0~^$G;B-%o}9+=Fdl~a0Zp;H79hls}OyyDbrV?z-K^tqrf6I3T$9N zAdr(SF#Z`J7v$s*7(fR|0bw`p5%(EzEFe(c1i(<9S_isUM?)9~Yz2YB9q0iK0dqj0 zK7ePd$s@^j3snTc_2cZCQeS?X0vcvJsX*Z{fDHpiG7A#GhphLAO4%4xdxt z0%R@zBWt`ni{|_lq~-Oqw&t6%t$cA^@xI83LvPUgJO2*XhAOwyx8jnx=F1kADB>gea$Z z=mx`I;V`>#-NQuRTYH}7SN=6K&qTO)PCCl&`LUQZN3E^T@%-(V@NK+nvehb{&M=>1 zQ~8STESxm(^9!&zojM8BF6$@tcsKJ+>85nNj{wlj5}~amA-wMFVdLuXnjB8Hb>=;> z`Dm)enwU?IpZnLfp!V`Kjn;O`aOa#iSgin4aKgGKpjiBtlMTTpn5o?C>DgGb9KVq8 z_J7U^#d|IcmDr@eK7>o|Q-+}ePG4~&PRmzVgAdUXd*T)}|FOZN-XJi#v7~{ zKw$mz;?BxfS8{f^;|u2=fTM>*l@gPLR>!0KC?Q|PRgl&DK;ru6WzSb6kHdboKcrMa z``&RYZuamq%5MmtQ$UZ=;A)cZ(2Kgu`|`lOs9Se?G`P|I12qiil(C$rKc;u 
zFBs~k@f)!E_Fj`YJ7CJ#3rq6=-lWejGecX+XwU5YXy6m7nm{glRO5nkubqi{9QSn6 zb^T)Vgz2TAhgA$f7w=B*X`#okRybllM0EME>hrhf(X{b&Ei(P!zj+gdOxf=zJq-)f z(>uJx00LcR4?+@-W(Qbed(F~9{X83W>Y`Ysxq54EDG^as?`oG8fy07YGi6wza!4!ykx4T@;dxH^Yq+DZjJhazl@%`9NXQ0h0 z%ynj;-oryqsZKEI8hKAoxZ=~T(Y0=x^!OF$6&q^MJs~5Z)L;yGab!BDBTHcjYv=qT zk3_Nm#*Z%;_Bmvm@YFN(4HHO3Ba{6WvX1UkcL3!|zzM^N8LD!Ig}LIZ<1|hbsEn7~ zYU(Dl_s#NI6P*<>YcHGiz*QiIq*Pe`27$~aU`5Y7j`yi>6tQP%q3QZYR$(b;wS7#T z<$UH#_hW_QX5j_^)R=>vC>Uw>H}XN9qFSu^2X)a z72Fqm)8y>o>k^;P^>Kth@h#AH@Vq9&+0Nm~N_(ydcxd#38i_VsV2FP+*s1mxg7%hiZ%s8kE@CNsM}*! z5LWNd6E3Cx;q56vGX6yvH^$tt5L|ydJ}x2|6?zOvL)&8!mRDcLSCpm-gNLTDG^ZL^ zok3fhFm;afEI4rI=hL4GZTtHa-$2-g)2If8H_)9z6bLQy56tGdokAH1akZy+a& oQ^*6|!~gSzgK!-G`>7|8Sa~Vj+tln#;K3kGHC@$eR literal 0 HcmV?d00001 diff --git a/example/ck_tile/15_fused_moe/misc/moe-1.png b/example/ck_tile/15_fused_moe/misc/moe-1.png new file mode 100644 index 0000000000000000000000000000000000000000..91a1f2d9dde2eb892ab621bb1fdaa9e1f7f23a8a GIT binary patch literal 92535 zcmce;WmsEV*Dg#;fffq1xRwIN-5px2xVt;W9fDI@pg?eUDDLjT3Iu`{cMa~ros&M# z-uvC>T<`by%dgC2Wvw~K9CD9)jG0g+h0o}yM5qV|2s3N3$7=BR03loFIT-Dqk(y#B;Mw1!fW21)EPoFWl!Tr^k^UK#&=`UK8GO8W0mT1j zRFoM2`?Ie9zNWBQ4T$1exW*>+|Nf{G(4=_6?S}sanSdqYpPL2{ArVwo1i;1#{+V5o zasX@(_5c5i#*9EG(*K=p_S-iQgt`|;XKqNqB=Fx0Q^SOdiRtT|kNz`gKHFY9;hO1x zFaOB$pD%QM|MSJ)BAP;EHTWpIs$rM09;KC(@_}3;V2_Zm{ z>R#Ai{BO_u>$bS;2F2p1%%RVYg%Q+z&Da}M?WjkGCLjJxf&T5EM|A@J^K-QxfO)Sc zQlBQ}u{mo>fiNZl0w+a+)_M9b_X^WJzdzR}yd+@BrZNJxJO|5~(p!LTf(mE8!nY4S zT4tzK-bi4I+f4+1S%3_Dhks+S8vMO3B}(`_5fF-=5exv9zyLPB%`(FSPU_bA^kPoV0MEPwE?HAO;0YKKR1sUZ>p_X z7etOT-;4ZB^3Nk?3d(aOC~d4|#^UZlX(_04=_oVnS}w(Oo^09@QD;E+nn+(kfWp0V zthCJj?H``p6HoQ@O7Yx_EKYL<-qfl3D^>_$k}*Wt zHJsG4UDb;ghJF^$9SN@$_hd^M7Y%LEM09&x>f+g1M5-T#5r)=E1YNnv#-c*vHu+DK zw#QP!T*Ud3#&NAVjMQ;afSP~lxl-@g0KojaN!T;VPlE%OsTPUD6p3_)0>hjS2PU3L zX7iW-@f+F5MsJ{04$otftB3h!UqSdeQSDcnqlHAZr}Lw}oHnu+h}NR4G)=%3)}> zze^MEF{(}`>GdZvH&K>ID|$cqE)k8up?+&_`zf^r?lqGGkV|#w^3)aq#Xq%fTL_Br 
z-?ag4PY#bX|E!=%=B(F3IFXlw>$eR0bIO=Fyg_JOo**=}cC>?vxPE^}Zu%P$Rz4cx zsS*tt&v*4`;JFGSs7aGQ<2~19xSUojgJjz}A6mZ=%M=qX9!%xc9-`X~Gd;3< zK=j}K(iLPNe$jfoDd>Cr;ek0GXL8pTt4NQVIs4D+brErQB6fA;M`x4vr(=Z*er1dc6gC?$y?B=MnmXJzkd}_Ny-rml{m#(II{U(RbLfepd!u$k z)y!qmG>BPwU%6jW%2S^IjHvFdP#d`yiT-1e1M9LiI^3mA%F%ZNiceGU<0Rsx4*3P3 zA$&>8+VyqaG16wiDTOMIx)*f z5;rK&LmQi-Wn^x{rkYXa#!smd%u>^%jxcYRd2?#JLszhf7{lLQsrRnRoSj!MUX_}j z%Vr{Le@PU;BXfP7s=eQ?xAO5Pt)fP9L|XJ#8}@Q&EdHm9-1Q>oVxM~8m3%f%ygUWH z4bt%vm9NOl6sAhUMLNS@yx)Q9EzV< zXUx#qisfEScUe{FO41YpT!Y|ShG0#dU<{gk^X2w&Mhc=AJVi~jZW2ZHtzu)xq@+tF zLd_ULyZ77tl+GEQUJcUP?NEfR0Vyf!jxC-57T1Oonu~_FmISbtzi55$SL4(ysCic8mArobPh8Co%N+xlvj@JZL*1N0dm{|`02E*$+hpKT zF$={?{6@1gbB+FVaP!^8bh6T2yvdxLksCB$1H zyc3Jp0U9ly`eAOY>*H6-DKHGWd!eQXsef44#2mOpv49M#(yswHH*Sz>84oukR5C-~ z>Bl7CvCT5pM&@d5O-qUMWuk|!`@nxwZW_B$D3yaW?ERjcg}(!ru;zUgS>IIa2AG_X ze2r)KSA*N9rT3>CK~WV_%haqlTNa%nx61f4LV?VJFELfSr7F zYIuFr5;PdR;Xcy_9+I;yR0zSc{N#K_$wXE?M%wvci{j_f)7gS|@kFB^B9i7Q-i^xr zs@^nd&W|*pw(O?QsvO{IBp?&*LM2Vy2rKsN|1>$t%b}ffdSZ2|p>3?genC=9{srzz z-ux8@;Lt#wL>?U8?#ZhcZjY_SJ+j`@Iu6vNO_QTf%oDUE*FWHogsg=dELu3<2(C=W z*5Ol)y_L)C1WkNNEU%?~Y>=Z^=lLRwvxR4P|Ct&u1qpMr>4u^-J^{zkh9^-|JzW#v zg%~$n|1u@)UiEsF!V9&TSy_cv+H;_MqNnWeqo@{z=UKAd@rZjFl|qZA^FHswz%r=s z1GVORY&_`+!=I0iZK-I->3F?;sUD<*?j_xC(8f?uUNuNWLB@x8sH$m(1fqmFC zTsma1mo?U_<}Vsks}T))NGTfAM8a1mz*7irLf1<8^cOPtkDKNsV(H?nZP?+$wIWZE z+_1#UnCo$vM=H*}o!f(cbra&D?QvEn|O39T*@y-Wd7|1^>6(pu^i=?kBWi z^FJ16K??wiEB)gq=z)oS%tsRvEwk-4UdedDh5@w;ogv5xpvsvbAffYem5z6;LkHKS_>PxGTb9K~7QN)$Fqgv&BkGDhzhwsru-5gnL5ZDT?r_S>)H)~S;ipN^CL;n_5 zeD)PrFTO>YbxsJby|n+g_A(?mTJvzD5e2fxgEPhPbM94ek_J#~hFT*n1!fW|XZ0HD zqXkJA89x!xog8;vSV5wHm;qiD)$}HYv)GCm3^<{xK3 zZ;*1)bYt)H&buV2b%o!(qGO9=sM@&*^?4}aAU3DS5qw8{OUFL-OV!`GO&2xw>WxSg zU9)c>WUDG;KYQfIx@odSgRQrw^Z-YuN9nk?a=)!|Bm9Be({=+k<0+7}x2Mv-7#rGJ zIT@OFh79j5^lu^rLNiwwA9!LRW~e$BTel6&ZNzk5Uf&>QKv%fex_{4R;&IA`1viqRONByJ=1;OGCwK1QHZV~9LIr5jEkWt$+xhsDpikLy_Y zRk0}~;+y!9F0y8)`$>GLOmux}J!ucjmKH=zMMgY?U3Iv)F$MDKcNh!bU;4SRXE0WZ 
z!dTG>8YE~?&4^74>6LccIt-zx#1n3!1^IsxR@7Q%?W(z|+u_qry=&mL&XuEBo3^u< z_M>H=T9*|Vp2ye#aMDKb(JQos2ou?&V4m4y#|*<=dyo@;^mj@RxTn`jyo7tBq8}2+ zgtWx`iFq`qKF?p+-G&>26Z8a(ru|+vR`~Ch=l=XI&>s>rn`^!45^4x$yTRKMqOP?| z4;y`mu%`9LVnGBprvAaae?-cnPCZy22GYzMnaM}yz?I}Gty9`7vIt6MEg2}uC%wV3 z3Z1NOpRCU=MuL!jBgAnll&nff0>KVhUzQZx$4_$^Vw0|#4ITwMFn)gbmo<%q5u=lmY4$CyKY@Ncs_DBNB(l^(IuYgc0jiWb)P2)!{>_Z1AGMM>mr8V@7-m)i{Zw9Do zq-I{X6Tj%#%d_SIf<|%NCfTur8%`jr>Gs_{3Z3LPkg69B5DFllE z$U=wnw=lI@buEWbJ?*fTOLlhMJ|z!la`O_$*ciE>a$tXX6wb7xq#_??nFJ~mc|2;I zta!>PRi%~>-b!~m$*ZC3SzPrI6^nR@s>9Z1+RC-Eh;VByn=twlILuP|H_Y0LNQ#c4 z!bpjH#4}uE_Psjo5PZ>9&%Hj&!%X=Q0J zaYII_c8FtQ5R-yWR`kUVh#^&9*;PMOJ)|x!f`p zm!)XT9OnSuKn)vpkl@m~Orc_6Vidff5u=b)q|oLdGqUHH!V=-z78YGyXha^&4&ZBa-WtmhDyGX_X-`0)%O8=v2|lfeW; z^s9D239+1LT357&o{5=5%LQY%2=%5oM>DT7*5_5+u$gXEbs)Q%-H;gRhq7npZFyjw zvD#Qcws9lt6}1Yk{0FRoNtZ$lG@k#*T_}r`XoHQMct_mEVm=7HlHUHjkBj|6)o7%_ z#jyDanLk&Tnz`3%GIEV_P#`Lfy?7>qVFDQe;y?WO{NesQYt zGP8nCEcHOF{1nJ*(NiaEm3T6`>ia6*tbXY@2d58JCJU{DJw>(ycjy3Zl7?KGxM7KU z$oJmY+)2>&pn-wAA=)$d%oHCqDm*l|V5%x{d*D4u0#jOu9{uqB33e+mn#a;A3M&Ri zA@Nx#X@Gc&qI0l_9(?y@S1U84^tjHv;4t{rrxC;GH#5$j@y4Y{aBN@=90Mkm#O!VL z#VX;et6Z|})uI+HntvAab32OgvN^JmNG(Z3!WA7>i%!Sy!;j8UV@gue;JtEADlaio zM}F~N*D4pmZcwe3nyI^E?C`+PF}gfHGypV&@xKyHjLkTJe?Z#cul=d++whH{d zG9X3V!3HE09yK?|<#)V8quW;+QE+MSbsuRnF0$BudLZ3cYFiMJOI*^k1hp^yu5O_o z2+Mxz7}eA8)Yeve6*4dF>2iGcK2BPsBKiPx4=AUgtq^R&?dhU&Q)|AINbW{MJQZAG z4C0~YKHCw>x%|yCMMeMpV+^?^SbW`-{q5$}yIR)G$;80x)*a*S(U}%k!eHm0rtBs@ zo&g*RGrtKp2b}iJx&&6vMdjG#?;x+2~(q)RDVX{ z%|3j`{!UuBo)7p;Y!OaLDMhfQ!c=f&jZ;h6HJ)raQ~{9MzQxsf31!ls@sl^~UN$WTVZ+R3!GScZs}jgR$K z>(J>U}vevdTn^md?#Jw37v^ zKYl`Y;lY=zFt>x3LCIN=;HNqdB`^DD zSd+i7$m1YGO!RM0niQt-2yY~)M5_yP(Zh>*MGMqEJTkAB1qJYcd~1W>jZ79+h-v^}4{qd794k4gT#Ux*QpRkKm1_bwzZIkQdp z6dcL&i{mU@sb0>rGx1}2c3?e;oiaM%O;AOHhJ|3o>F$pYPw!3Z3s#NU?y*=$YS3rY()G_Z@f+> zxhyj(m%<^{4as(8Odl{;e(WI?F|E$iD9WK3NKhrC=%wPq4Y7pWOEp)+R45^8Vv<~} z1Lp+rZ@Iy9q=aYxO-7Tg9YB>FF5AaIar~nEfU9-8>8tJ*$~e6y8td)twJZm| zV7)O2czvpM@#yg)T2fegr)Rh+Ni${Uc?r`Vkmk|?Qc4ue* 
ziAJPy^bR>n=k3?{i__emj?un`6LW8`qtdeqx@?_L4)}wCKdC{9;7~V+eG^L*AzKk@qJGUutS( z0E{S@-Uzf;P}RQN@qCv`fyP|L_%&;Bla@w8{@cM1i7GW%ZbHhMd~=HI&fz(dInp-6 zzK#hbqrla<*4SVm|0>s>?&e44DL-^HIB-=n?F4)sK={ik*7Gc<_>*3^xg--3o$p+< zS`lNCUR#{^hP-Mt{J=s?Jnz3NVt94@ySaLlXF(C~Rc|TvrNnLrS*x?_GOy&EkKBg{ z6S}U=ssj(lc29FPFa3m^ z&r8}!ANCY>u&TBuFKiSI?NZr_s>FHyhTAF6z;iwX^75ZM0qA0I7TIl3gFM8i&KB$R zqqBH*@Q|922Eyp^Fv#=gVsAAkV|I{RVNtc-_`=l1_r2wT1Wj;_`s+Wo zJ^gPT-LnHVOHfg2ikGDwYW*FQYgJgHBd^vs96@;Wqq}>?%W6X4Fb!#;q&5qTerS*$e04+T0hj(D*Rzrm0_Z zhv{wT=lklrzPCbYQ%(!ckC$hYLuA1oWS0xfA6i66n^*;ltd0}Wl{42mj+c4uRx!Ot zgX*m(S*nynl?1&mmU}6^g=8VTZrgNw@8|>HaaSC+2%lO=`mK3%x8#;=*@}ct&ZJV5 zQiQq=mHgNxKji1RlspphGVi$@4WSJ8%S_}e^e-Iik1oV0JNy!U-z;sHq)e(SE~21A z=H{|cW-N#i;a0tK?!{^#WX%4ktJi``P#H{D#!DaGhF@3|Cl-WTQnYwefWNXFM50sn z41H`XPhS@DcJMmDx@t1Nd~z61Be|(mz7@_3<0F)?8G$~U|HWGtco92D`i=GDdQU4H z9dDX+9Q1G2rQfS#-$m#&Z>6=c-qCGFY8R(QEse^Ye^CsSch|0#PB-94+?sC?O87#>2(g?X2)83f*NM z?@6HiXK$`6xJ}+WJvI9pMoIdIRU#u@43C%u=ik#MJ}9Tm<|EH)=3_)>ZX*fdGV8)07DR z4%W-yaONTN&&(1cp1b>7Z|*2Otw$S-%ZVcOYaM2`*Zecv?d_Nid+DhkhmZRzY}5FL z(nlrs?$kRl%J-*KibA3xVKNc}t%MmJPbS+5t-6*NaAl?`Tk0@!oUtq>OnOH}(I@${ z_2S-+jHj=$`Nl2dz@LoF{dA?UR6*~|seY86Q8F0hw3iLpzl_?O6_sJVG>BN>_09hN zKKP+w|H{euVv)pO*s&|U?~_Hv@AU?>>bEmTlpVeqG0@}9M^ix5*FkqiGGd~$`-`Q# z$rW%Ad0bXI?BsGZctJbWxA{WcNJz1za*Yui@1f8xfcb}^2xA*|??!yNzlAlF>-pqd z19IM?TUE*PTS7QKAWVz=Qq+*&=)Lh%8u@_6-btmgGe6huvCjnd6|GdkV1ISn?Idk1 z)rDX2`qhJPqg%`~Zwv%#cnzcI_1_gY2dH^GWUthN@AuR z%{Fsf%`(R^>5BPoD5`Ptv!j3!&DxUfK1fzM;`E79gEepAz@mu8HW3t;Zvv~Qsr+zi zr@2|pkM^um@gpU7_zg7&zVD7wP!X1m%O@}UoZNbeuRcpw!4b^}>IwQ<2PQVXv}jJc zuda7Ecs)BBMsPfhR_)@E{%(1>W04tNM%)Z!_B(g*U?9U(^$=(HzXO)R zZJ^$*whT49JGECa;p=xHOTW8Ai|bsl5Q@ao0gWHFr0RJL1nXu88YRqkn4#T)T7bC{ z`tVPZTp{;$n+hg-EsDsmt!CR1NA_l}a3V`{&Ca6TVuoK41O8gpyCL<2B zN|CxRN_+0JRB4LoZ;05Sg%oiG4|q8phpx4??G0uvm-%%(2JzSrT&ofo|L;wogLPes z)SyDd_#F(l%L$)D;c)J}>RZ;#4&%&;=e zTCv~iOxJ2{J#JATxKwE8w)y1!GehTj;>s?67dS0T53fJ}Q@Dk{Dd$tmgi+9?);{2z zO6OHxERkdtnlH=CeZs`Y*uB-(@(!w@g>56^M}Mk6Ic+-SedWzkFKTWxrQ)HUDA7Xv 
z*!OM{6>A=E`Sewal<(3(nNw??!_d5s?QuP?^{1ywje<>_gIkKlsxGE8<@N<39L84pJJab8A*((df@V9U9S1zYJ}X&qhyjrwSX$~{mwGVeiQn&Ex+co z#>8_t3%cB@>XEsoiPiQkhlv=DA7z*D!+Zpk_z5lg_;3xh%%QzWZ}_tfmnit$@L-DxC%Y*MQHzpBL1{96ji8cL)!#9Qx*;JSzp2zU8-nschiZq ziNM(K)|#JL7S|Lg%9&tY-sGIu`qnVww?(+VIrJj)ks&s20*yvRb5UY>{BFC0knHoU zqqdpw8H(GO>2pgri3}KqK|A&?iF0$L->JY?flxXrU)*InJv_o)QqLhnDe&x=z)z)| zK$vxI>uav=F__|qoz=6d%RDpofr=OB_$*=)j}+f36nS(RTpdqWXa;hzaDe!u2NEdo zFyLl=;C7>oMIZET-B8-`oh_(H$2C>M!e_Fv2LE&WYXZ+rC?SZJ_%^&kT`~nrbzusV z)gYVwe>ggXse}?M(2b(QN(R1#9JK9Bz7n3%ZirPXuS~zYrOK5970>D1JCC;*zaLrL z?H5Rw3J*#pOt1j)x)x-kzE}5PE&(CuA{7-;hL0V2BMF6nd>Y^cl65oEhJ4k zaiKi0h|}P}ZL*&a+MV!m>2=EDP|9sL3Pxe!CXP}i;a>gMUKQiX0REFvs!2mmrLg>f zsO3-AS+InU?ua~}(Vxe){nW@e0$sl-Zu>e$7A0~{8Z9trr#vbrT0a&!Za9@N#pD0dXZ3>Q)6j@ zL$SA~@$@UlyX0By3^L!A-VF~7%sC~kM{P4 zdhXVU2&u(BB$~DPdf0CMYCjx&x+*)&W@BS()EBHnO;AG`k8^2pJ#sa^q8^@W$ZX++ zwI&>myk-t(bxW!bfb_E%7~>;3>c$#KJs;jUrqM*9g#zQ3z;VL zaJd9a`9$n-`W%XPE3Yc0_}c21R<4Reck!=;^trIzqAa(=_QWx1E-mjKgC(PkKi=GF0^ zJ_nY>9kJ7zj#amt*hpwXYvW(LegvCBZpXaCe2FVFB+KcU8doPf6pP!Y761t%7y~oJ za2(}^0OfiQ@6|vCYLZg68+sl(t!LJE)XXtL4cGLTw@1FwsCL|}ZG;8o9O}fUbu4AS zqJZrahy03}OWwK|N4GfpEo46uF>E?um2=vnIt9eHV#3llpHidC+UrD?jg)I= zt4{fR{OVeZYLEVN#wUN=9%Azm5U$lG^uK(Z+I=t|JgKiscCg)09v(2r+b`N8B)6-6 z`a z4W-*5F!Z|~=pxss(Q#g_mf<9*Jk77LSK}aH`O^)B4sS4mH+iX}urKc`#$GyHFMR+H zLTgNqMN-Z;MGqr189lMRZ9nJ(8=XcoDir34j&oT?uA3*i0W*5`=Tj+0ZeZ`#t#4d3 z6^8e_*wyj8u4l$G1;WhSi3u|qB^Dq8+k}1^2+oH@S_xUGI_uORVtN9!QB7&HIA)&-Xf>5HK-AVTNfVhes(zd z`+OC<+K@6m#9yEG-lS(W2rz&r{BQo#V^WG?3%Z+rvY>!&_AL7g-jqu46CJ<2B{*h2 zjJR6Ewjr2VTAkN>L4}<6$8P)63i0LRWkOA>6-9NqS6%qLE0?a$T-(oCm1$$X zquk9f6WiXJNs&NRmtK5nSXIO!LDLeJV_L!!fzu4Gs1(`v8xr+?HIEP zEPyXB8`nc-=8v$}_z}3rUqaYMdL4q5FkXtG_o6`JM! 
z-%x1sLGl|jF!`EiM>f`0a}FS~;9ORvrlmyD(~9k8dmvQm@#dlr-ue{qheY{XrmV{Y zd8ZLGA8IS>IhOm6sdMg1si9JjQ##v8P zvN|B9pV{r*dv2QMrqe7#u5TR66nZ+}JNfqbI$_2y!k(o1^Np^T_xoNdT^h00yNZ=Y zoD;}Y?IfW9xAC4pg%PMIZs&mW=_S0S^EbQ^5b;k_UngKHrwb4h8AhcN-)~X5 zS|Xtnzy{BGyc);D$_|lwpErn$!h4IFSMax=?yUu3tzO75mUI>6I}wrBt2>xqC@){7Gs z7kj+3TH;H<+6GpKi5^XQuB!7un8`Vt0d=m@5T*#nZC@v?!+x0}Go~(%V?nVgQTNMm zp?!N0Jn8erb68?``4Q4Kd=?K3-N!h1@S4or@!qkoQ8m4>e53EZqv7Cu)!B1?P+#Z} zA4Vm}y+3`5)WgdBF*Ul?p6dSZeBDF~ z3w<3fWA@EmIqJA!6PcHRI7S%62^LuFDO#rnGg^$)GG>Vq!BpmRv*zB9KWko)Q+D^# zN8odP@_EU~pfnMI-}d3VVBH|2z&ZF#Z3ZDLDXd>KTZJhUb(sX%AnDH)bo0%WuED=Ep_{z4^Qi@jhG zaNZn%ihpQn((GQ~QT?re1eJ(vxzZs~@nD4XFF+^QGYc(#BC@Alg2VZ%;Hls{uk6#! zDcm^!&S$r9p8n&1`-!)<&Sn&2C*VkrXZy@C3>lhH;|gswMd2*jkBPpG>@)V}ofGm- zurRkzOwo+a%-Dyn>C6mEYiMlkmc}<~H@j=#4ew1Elh2v6yPTqt(NLlFn#A`lbmLB4 zv8%UBn%f)4FKHzz(UK8)kXcv_3a;D{tjG@kvRL*7?3Q@g?R zBF`^EQ2qD;4W6E}-k`G)ILM22>kQW+W`&=E?Ai5fPtg`O$Q%RO{|PkVd|vyo=uH6u z&Juxq%QU~Q5scR$Ws$O$TVCaFFz)j0=JPC51^4L*WYCc83PN$E^ZZGonDN5ecUN|2x|1xfj=z{wUkk4f zS^5TMlrzQFo@!(^-MV!|+PiCb#HsLPwd&Jj-L>$Iyhg9VIkE4MS_V)DkD7bf=p4I? 
zvU?@y9hUhCxmr0#P1lpys=1>88@G&3iA`9SQzhNl4M>79w*ne4 zj2(1>%4dI$C2-9F{P}#WE+94$f*X{gF5$9<<{MSXaxvmM^ocKSrw3%cRrT~hHp7+k z1jnQPWOnPSt%{D}>ngA(?qjqsYL%W3EiTgsWrlqjgBBbE7KaMNv)#jEmp%vGA+H>w zEktB|pxc`_hqHeE?|3cEnF*!@n$&2?b#G1(9zB0vrQC};<_sFjs2}E=f%<{PS%=-zfSp;mX@G*75okBB6t3)Ha-5s()tx;~AxD1~3U` zyvJgYi&oe)r(J(?xie9c^Q%lh4uQ<^PbmLSLQ)I?-%UKC1?A^=M81;f>vIqx1o?yz zMlJMDSotIcoUCqf_%Gl*(0uJPnOz~>&ac6{nVt24% zerY%LW=RTZy0m^@?mYQal50h-+(mfGrw85`@D-Cc zo9A!ibA%2vFyGke5%3GUo~?PjwX_UwAfB&C`If*ZHZh`ig=FUdTu}d0%i2^axQsd=2+M> zp~h&ja|3vQ`<8}y(%q0X=P9+#G6oj8KT)jrC(c+n2<+}3v6hITjk~eAavB7Af^=DW zMpkg=SbiVJy|e2VdR z%jW!Bsx`~2R`DRabN*x<$+}uS#?_+p^*fW}EL4=<(6w5s{B?eFj-Vl=dqrf|Ib2Wu z&TWhx99pcWLPL6ZyA?znt2kg#SvUzEV&P79Y_1+wE-_pR^~@-3we4244&&O~S@`~(1^er}DRq5d z(q0Pz>oPsY;v2+|T*$(!0n7gr?;gRslWDn#qSEx!52q) z@)kls%{e!Cy-=N`i5xk=`n6BzKPbFFC4xtQN0An5Edd~p)WS`AU<)v}p`^=hgt1p` zTF*S&OrMX3Bh)W8AoUj!;Ja&cXQ;BFu1;zi?8eK_bigcyP&>Dq|5%GtHSK_(u#$hK zHvp#L1ox}(3gyW&lyhX@44nHKPnO|Y_KOn{ak)uNHcHz&n6;%&l1`s{(7e-l92nK; z0Wc7=8pye^FBev-5JlZuFBf8?e0R+mnVY_(GG*<>g8WEv5qq9$tIsNyMxjGgX)(h^ zl!(}F;fz#pi0J#3*aY58(uqRy8-ADnW3RJ5pjX6&TE|MliI@O`N=u z4$5$+8Q-I&_n9xY;ET9V3&N=%nGY-c?5WJ)1x~ZpS}`%Eh^yX%3E`ugKm3>##MtFLoTPYGxt6!__WfpwczAAti=5j7OpFSU ze)hxG>;)?}Gb#Cf&n8DclAyvbmLP(IHoP32!;m~OBM@$3AWu)OWW;=%cVn^QuM;2% zpYu$Ge!-ChF4D3q5xhM14}$o!R(yolXz~#=FX8r)j=KCN&3)!v2I)v~@|gpF1pEl# z-V?D9&|K zF!2(8RPpbC46jnb%Ti{zI5TP0wiaK0D^hw9b-2O%IpEKQH(g!Wbv=YyugVv`H3I$P ztW+N&JZ2x0T~KryzZBe`vxG2P;@~`DfOSp#nQ1TY?xS|e?K;S&m#)jd!Ajt*$?F6w zfFz~`zqMb7@H0DPWTPn@AxJ1ZGX?P#r`Y&cK1YjO1H1dv!)KS5zblxUM3mv@n*P1U zGAQR-UjVR zQ4J9+OZhL-p@0HB_y#XAUGFXn#%P2Jlul&TLU0`Z z%y9AM^o*|E&00ENWzU5X&*dosML%saO{6KD?iM*2exj0fc`qy`!lNtDO+IZJ4KryJ zL#x5|Zn3#EHET$hrlzLBb`A>%2Zw~D%!5iD z56*1pg_UEBU}7~xl~X6 z@M$EY-2rrfyBD&iJd9$WxinLd-*GwtuN}Krv1f5)$NiaWK%hy^@z7>gaDy&qUK&gn z1Wg<*G%jgc3=@7aJYD3V$6Q3VC1Jo;SwMt9bdsGipTm zk=VacP`nL@gPU~D`LO{Wiep|qe1rTfDg#EpJ=J0khio{{@_vgqnoow{VbqhX^a9Nf z+TILkH(Otd&}7&#b^&aj-}L6Qa5%$@liv~#N$@6Jy($}gnEU#sx{`en{#s5b%zhED 
zbFvn$tju$HpdAQp)8WkPxL?hg)Ip9Tv<6;Hm+SKJ@wIuiWes5w5*FtK4{$FNa#)lZ zb}WyKRJiR=Yiny)S6A;`ulI&OK0e}B>V?ms0F$`kMMy8i|B?i9&0z1N$MBHKl^ShE z+?1aDUzR+C-vz;g(K2JhoPD{^Wc=W@=7(|89PwXfHFkj56kogG29DB0vwKz+0+y4^ z^Tl6Py}E{$TkHdVF)?p?%9an?$(Fm5eOa4fI+w}G;06&N=wkqPgwR2Ggy&HzyvWQ; z&1T?zSHZ`3;|@=c3o*4pTBSBB4mVxU6lF2EnN8IEkhyvF>Q!i1m`c0Vr(ZW45Ny}g zqay`L?Su&Yd`|1h{;stSvO%RV(c+#SabjZPcofpSf&v|F?P5+XJwE4S{Z{uiRHv}0 zsL^nI7FS>2a|B?L`@2 z;936h;|F}>Ha0d^L(aBG4M6}lF$h4p;Vq@4_~*}1t2QSmr*9~j3Q|%ZBf}ooRr2~8 ztf#CYu~ewIDz7jxS6V$-=%q-(z})t$qa_Z}Wc>eUx#ne4_l#7Od&_4rq`DXxr=A)k zXLriEyI(*w6%TL_6t=h8;1rEAlK)Mh@dD;Z$8&sST4+d9$R1kJIUNW8z@ zw{+P8P8DmuW&+|1SbzNZ(abFG+qZ6T1aYB!S};?RH3+WQ?d|R5x{YAO9BgcC<+tA6 zcge{FYGEan*S#ddKD^up@#xKUr*^Tvgs5wJj8nwiw#oc%C4R?2%$1k~EO!5jP|158 zWR>k;0oS!KL}kq;ysd(rW|H&jSR7n3{pKP=$D60jdP3e?jdS_n{FGv^QjVbB3~$%A zrfn9}hMT+UonwlTY3C~8ddHuQV_c)FfAYt(a7+U=p}tJU?)%XY__0bzw9}mPGitLM zlbl(w-ig(*J6eb<*WYp|fKgKyEVf+yNN-9@NZ8()Emw({SvK$s-NRWm|6&K?E%Q3v zfc*ab8*>=aK~hs&D-A_QMI{pP6O;*!it3|`PGZvK_BdLEJFE9IRHalAN&M{EOT5|_M%n%`y9*mN+#vJX8{r%I)K9Ot?gQaHN4qHSwtEGSMQI9>1S!5f$!tNj zG`BH#6Ko4VX)Ea^7fuYwPtA)Z48lx2B`3wm|Gk5pQSz~xgR*0`Yh18Qy<2+ed75`~ zt+u3rio#pg{i#Km8E4;TwHEd0nP|EDe2+Sk+A>ceVg}HtXLARn#MO6@{1RX z?Y?QLfW67$sw!62Wzu*)Xv+t&0k-kVK`RL zkejJ}0*XXwUY;62hPyWe4Qiw5Sb7s5@EojWvX3S+%XO`*lYmYLaF;0gZpvqGvVK%p zH#0gK7(1*#`UnF!mJ= zQD$wt#wrG(AV@5rARyAMh(mV`F@T5+(nt+3FbG2p>CiQB zkM4fo=dSL(_YZ)#&Uv0+J!cXNcedON_}%Z7`pXz7?tNTFe3)J$%jllGgx|GVpMz?& zJ$DoQa3^IPVkU*L3%AuN?q`VYj4Jn^7Rv2w?nx6XDk=i#x88HjSiDFFI(rLL7)NyV z86u)s*j#tY)hk!xQd7hG%VzThjTgnui1RN=P}0$f`R;Glxi7tqQ%(_Pxp5=6a-jq2 zYuAxtO!8yF0pt-k3R2jtu>wX_3_azF)q4>jNjWIj1y^~{o9AI0|cB9eszMSRF5-6lRLqX)PRZQ z7IvQqD|Al4Z?b%Tc}UUT|7(fP{G|VJRb{0^-;_?;6KR^@c%TjAQu1D;Q6$=*?pD9;0g)RNGSDWX=HF<1dG{16zDy* z;>zu%LRV|X^zu4iMW;+UExvujx&`OikGu9V!~Y0M*xwRy8sEj`WT>mFlQM`uGBXP( zSKha3c`*_!B1~Aeb1pDZ4Sg}KJz1#I%eCD9@gO$cu}*kP4F|gZ;LmJefhemo=h z^iRx5l!hm`rSU+}cA`1gM3UYLInOl-b<>4klj2deM672Pt(O%kvEwBd!&X^!*CR|` 
zuiJ#|*n74U=~v7cyo(tf;e-;Ds5EX7oPBxXksKTa8Z1J6kVJ;MO0oS_Hi@zDg)cyq z&rF^2`>-?|J6}Suo1ST0?-rM3QiDJs>gwcP1TuP2%^Vf_RL28cXRQB(+fqpe-C$mo zsi&oac4Kk<)=QTD6uh&-y$26q;-qZ3cURaY^miPLGurk(dH==SfO&j!hi&nxNZOCM z3LIf0jG;J|bSV{dM!g~3D-^{x>@3d)$=LX%Q>~iX$493*TV4NBI;sbKF-&Eh1U*BZ zH#qS*vUBQ-&P^@>xbDp-m1X4>)qhBb<=^7uR_^VQ!7MGBa(r=BGxzYj9y34w!Q4j7 zz9)F(jRKZ3l*L}v$XTnmA~qxOL-z&W2$7uJ8yrm!qP$C#ZJY=+(6nxIl&2>(3O4lh zjkSt!!XR;(D#a-_UvE=93Lu?kc7tk%Bys5aNSVd)-aI2ex@=+O$ThLm!MJ}Ur{|ND z2U}z3TpZy>ro?llZSInXOf2u7D=Az`kF1WZKcloXG#uUrvp)SNS`Sd`UPM1Tpcs0e zqVhM27DQK)k~V}Q%q(sC)o*Mf1>ixI_UrP!!Y3uyWyA0LHLWM#$GTU;lN#&}YD#-B z4BQt*R?7qhI4*Y(i60ycrwJ8`)Wwg`(D)v=a?mMmq&|DAcAb7~7uuxMsb4n#y!NTd zEpokcznO%eqxf1Qby!8#@Kib)GWERKV>sb9trG289_2-Ejr&47M|*22y~e?31XIT< zlpnX$y}Q|ug7}MhfJQgq!!3up1^Hc=L7X$;fnqE9v8WU9_A`&H@iSTqDeM=IL~Iul zTUz>Ba#S+_oP9i)t7ZUqBFrY51d-B*RuoB4-*G0a($V=lEovAv@st-nn^w6fg6QZf zTARCFF0=8~)L)f|5`FmWPs05apYQyg&;DI3c%AHkFmV*SPOd108%=)IE_|H6Ol86`2e{IUlm?UYeT1l`|fR?wnMOxDA-du>zkbHE1NhnQekZyZO%&}_cXGV zq95F-_gK*f)EUsvNP4lk<>m157u`oJcbGZ62vNgx*86En)^|VG3>}E=Ln9VP3}H*= z-!>gkO~RcKmoM8?JBp$fLPWo&xgl) z2Q~jcL>;(7IvI#+vHkQrCab5}N1>x z+*eve73HZG6^?%xwXg1F7~AI}1!Zf2vIk#KymAsg1{TBD5*x^#`kpK1J>;iuhEsOkMMy z`N(rlWse@|5OKKf)0SYg4xcRa^91v_+s_Hq8Z0?=FS08ow{oK09j3BhrcHL!0^>nV zEjlTQCjDMSukv7S+_*vX+pg^W;?JB7ntLHH zUyf8erbwSPFffQK&OE#5hZytJ^-kb2@R<5=<_RM+^FIX}R=SdV3NK-6gs;I3HmFoY zGq#+rMl1q1_~c&jH)|=^jA3mW$DF!cn1>XTEDURMPU{+fcE{e+M<9Q)V6#gDwaT4* zo0^iHGPJ*9{SYB>d9NQOW$aqq zP1IICPZJ>dNIJxFu;XPX83hm`_RsDrFEWgL=%k-72|#*_Z?oUM)LLB=Ci4&~5*(*gXM$sfnCI5v9>rCj^t;MQlD_9w|A0FBS|Z?16CLgCxS@ip zJz1l|977P7S>gd4%|GNdP!%+(mP7BI6@tkvnceL+beLVwa+=LQz?Wz*-w6mu4UA3| z#2+gy44k1^dq(2Tx`_*o*hvvWH!+N&!h4U+HYDBp-uk6!pw?tD2)NJi0*}tofJEtN z>1q*ndh)Pl_REZ)Q8eDK-bn`%1%ic&Mzsnzq9gUHP{>p%iFfqieb7Ea=#%eE4}DtG zX;A7-bdLLzuwD=xbZuh2Cu5HaKkKR z6(hQG_?n%wGhw>*vYq*#B0xn2&!f-7ayx9-uU}VFQ|l1n0-#tmUHmT5>gp;6vv$0v z0jO+$t{U?qvi7EIrlz_1+lUB~jzCg%4UMA0Le16TFz*ilal}>Uqm|iz5U9YQCPSm2 
zy^Y!2T^i;bCn?O^n10yt?{gb96>?ifbt_+UUqu?Ps!Td#M5;aKt+(u$v1v_yJ6+eW zcl#{;-bn2#yd97Rll60HFLhcGg*&(%E~W)uwGdBTF5>DUSMLvYvqfh@vt(14f;>p1 z`?ZSk3b2ppacMFl``M#FyxleSm5FJ8(zT{*jNa@w={_3Yy3$Tv-kdhZ`McCeTTAQh zd17P%={MRZjg5GDTfqF|A&}u5dQ!`P%9V2XiT|Kt7HDy>HR z{d@PiUh^@tu@On7O8FlFWs61u8CD)a&Br7vN~^jQ5f(-#4tXps{VXbqY(GqKim3re z3`6-tF)WH#ZoMma!6Iin6SJO~BI5hB-Rs%G0QwJ7wL3_S-yaRCYn4{>Qwzz8u)6D# zSNpWrtYRZHl=C+*y2Hg{2|Xj5FlYTh?=#_^hu>YbkV|_-Me#nKOq#+1C4bzc5*yPE z`)Dql645b3NtKdy>>{aOs0+i}od|~;rX!SU-R9NI$Hd%bhuz&*$NNjlbhs?i1*l6z z-$OVe^TI_qI7*aLTt-Lt@c7_PBZbh+`a{D8uM(Ny_72eqB4If>XHrt9;^M*tWEj2B zwQJY-`TLujPlha1+76V?0{4r9qZ5UK1MLrT&CT4L>6W>(Gb05>pu&sb;I)bRt2odr zMf~`J$rvLeu3E`(7p)7+8@(UB4Lq&CtL|FCqF`c4o<~~Y5+{}7{YwjX_5yq)*!L~Ic5izk8LCl*sZ}8Z*thn7>i1<6`Xt|Z^e9uIM4cQZ; za{Yb?U?Ro{NM^AtNg5yEl!B+gp@aYN#{SGPMgUm91;$9eF^0BXq~hXgx`g6~7H>N} z1+u-nvZqcRI!$6!l!=Ld4@(|4i~_GugI_?PeYDmR%rAI&cmN@FVXZanYCBEsZB;k7 z%KG|)!@Uh4rzw&e-RH3`F>d%IRQ4Xy4B;UC+a~+wAMk%ODF|eLN*KyYN;Hy(21f3Q znTdzjN+Lt+?vl(;zjv-JkXf=s+!il*75w|x@d}$>7R7{{-khR;u!g6n-En2Dwc(*I zQI`R?f9CHG2i|GYQ~Wdh(f3lDjc1wwNrl+h*r?PBtInX?WgdiOJc`EZm z*m>lE%P%e+5Wa&PNR$&Uv8S>)Z@*1fOojg2R7hew8I1BqazYGSnPI(5l5 z5Ljt>v+>hgv6SQ-&Yn!;)8n=FZ{Iq-awP08ady{r_KiXCw3-X1^F3+9!^8Hd;t3#J zrVke&p2Rma5UL%~&jJJ6)j{{Tc;SMAipqW5$P+DE9{2RppdRXPr-l9~WdNRa?$~Lz znFDTcAYwf&lpTICwzZ4rX5H}@JYI@w+jg>i{BY%LRQ=8uQ$O95RvV z*tcgyr#x{P8xXN`P?AluGyv1gb#-y032p5^ij`bCkGy!(Y>WV@%6Yc#y0s`lqNtT; zyOdRO50P6gZEk2dK3?nEuGXLDy$SMsyABaQ!fi z92KiLE+1KLT=;GGMNM5@YQ4}4)XuOf*(iUaDlql>;}Yic$EItDtd@7Io>w|m;BaSQ z$V6rN%Vg-Lhf?mAUmI()s)Wf^&pDW@g7r} z4E_l={z22>Jno|+++Wk_;}f36PWMWca1Zl+WT7eu2h+AS)yK{DNp+l}YpE#{4#9QP>S zGT8MHe@pPa#A_=8Gm-vhGcjKz!{N5zJEMX0qsHA#T?L=&*^!0FZnQ|D;v5a!YtqJd z48Feop6tA|uzhFQT}L{!-fKs3SNpAZjhZyWX7IlY z%5a^@`>enoP3%IuLK+LYG@yp8W+@iEF`tcOg#aF^`qea|`vp*i<@SjNX$p}s{a5l+kg3p8T4+ZD7<^DY{jnJ|xsZ*S;xWsj-!{S<@ z>t3r|DCM~j&9Bnl!A(26aYjT=WGNp0gQ>{j>T6Bs2XE7Zk@tXQd9F#x1{FUKY&ba`KO3gEP>zh}8Z#s&imy=mk>FU;p@hy;a+>Tdu~?8v%w@_gW@HXJkKGTKJ*5oD 
z@w=UCmTbkZukcQVNHM-%5V8GXzC^y=y>j|G1ksEPp<7*jenS{ zj$B#uy_aOb<6gL*A#pr;`gq&f)%u8Oi3fjio-+&~uWw|lwW8sD!IO1zUdl{7WR(PU zR-$Y*UtvKD0lodu?8xF@jg&~k&!my@ulTi4#UJYM$TwLB8oFf)dq9; z%Fp=I6}x%I>l#PB6QBKW?Qa&;Z6rlabdH_Ix-;CzKm_)PrX{WbdFL{9$$QZGspmxMFJ_6qm8E9~Akr7i4ru z-1Z(566_Q6nycrMS~;GtGn=pXoZzp3m%zVNfUUp~eKOXY~b3Ej+7wSH0ybE%|uy2LzU)MA74=MwaA&triaFCMZ&9 zEY4`pS#ocn&BV_W=O}^^@D~nQqGR-J+>lybT3)R^hQ>@;9Y{$W9nG_`?eB?in1=VX z9ki6F9M58Tyu1#;3cIsB_NI;hxZD}Lh+lPD^WCOAe;Db`;kT`h8LMZSFf`)7D2X8y zJRjcf5kjvYt>shK6&E=lw+wWz=y=LiY`=%31z#36=I@ma8y?e|0Ws#uhTll`SBzYn{Sr279^=xo^=>q0kF|0p!HAt!QoPBo3WrMq^{v0S^;iY&aQqov= zx6DYkSo58s-(0VWk+C1A)NyN;T!DtVHrg<89UszcI#TOuqp7J0Q0a7Rxc<-YRvk66 zzQA}CQ?-i0Q5>+{BC_|zUhVPya_sMGve+UtDV*K7kJ+L)Em~J!(Bycy$jcguqa;Cea2$nK$eJmhN1eyRqd_TOPdF8rgHa(P4rXtz#wHXV`zcKN{@+B;xq_ z+G+*(9xdG>!N^uZe}ce+SvC<75jHltgzML?J?$?_&3yOH^7C6+ zAY&@a%a_5MtKaL(8W#=VgdIjo-EjR=xvlF2&@4}fU%d%Zi%h)60Ri}DWeRdSJGICJ zTS!+nO6<6e$$OmtLx1=FNpud-rp5ZMz%>+4H0sCSJ9 z&UlLGNR{AiXp=|e8zr5IUw@fhlQ@$0mXlFL>&a*gw0bD4(NT-_WZc=J^=IrD7#Wki z^B!KGmR3EmlodS+hwTC zmw-dkyCYc|1nDcOiG#3hQ3H1$KYsk+(`<&61{M~lS7D8F6E%LNZV9uqvpXaZ*vaSz zhlcj{_klr&8Yc;dF(oA>okr=>FfcGMGCE7l6>@{`1c*rMdx+L=NDf{wV~+$wm-A+Y zv?a}bJ_jpO1;xc&+4%!a`nF$g^ib3~luP`*}cYpa7E zkL_YdJV!x60pO}Q9*)+~g{^()#vCjsr&Hw-;qpWFB|aN>utva1AOBe+Y6B4e$8Ea? zTak^|PQ16OQ_-sSU#AL3(NGtFdDZv9E-p=i$~vDRNm)S2c;&e0R=2#o4Cp#vXFRw= z({mRuhJ}Y)s_ScL$jDaeuS7G;~yu2TV~|0xG4i( zhpD6YkQaa({ij!fL0yT7Ra7rNm2!6B;;9pc*$3lI+`ejpJwpx0r^7v!1hgaIplm-` zujr5K@8^j?Qc*^T00PXxp;#A`Af9j}EG+y48xb0+UtvuG)6~)`WACN`0i(P;@9pL5 zsM(xZBpod+2qnlX3UypdY$R->VQh>ve=kgf{Fs2=&E6Bol!UqgTw`q`K)iec2 zu{bYBLF0t}zP|U4vGZm~I%?`ig0whiHMPqo(Vgp8u3jY~BFcfmW@l&dd7b5Z?ZXj? 
zZi)PHiHT8PtF>sUOCJ7!_Q2HY30}JZe57q^2Sib^$-zO~($z?qhUa2$$NJavy{hJH$f*anuDVr+Vk z9zML3lE7)y#?i@oaI-$C8Vm<1EBp8$vfOmAN@Tvoz}x*#s|&J3TutFw_P>n0`1Hi4 zo9atwmwUgP9K3~?7n6A!TV;s4hV0ya^EMDr--Eq9$H_(s9-gRfKH-U4g@U2EnHiAx z*-)nwaS9I&%`Yl~xmnj*N9YLKCQ*veM%-wCPk7KNB3A2`?yLKz&1XiP1mg{Vf%>k8 z;W`6DpY;fN*(GrQ+G{6F1qV<|lD<8+o&A5jz803EISKPMy5u{Q=k3w?2D;v`ojCFP#cmbnUM&L0vZ_r4Q_&EHNuwFtnIrbUZj!o!V%N#iL&?JGzN z1&w0IRHes+B*;3g$G<&hFG{j5oYbM<2UPga2e5e_0e|<-{PWu?56W8gX!1RI6+dAP zd*??BgGmB8pW8lsh#Kf;7)9I}t^Q6~!gav;0DtgN&lh~dM|~70PDGvo2_(?QEG!t= zRw@fV#fL5@Wc*o`Sy(0#lWL=3gM+CXJ3BN1)SyoM5)%CH!Job{gt%5@rJU%(dLL(B zE_Qhi^4AAN^XX_UBO|d>NHN#BhRvNX%~FRe&T`2q2DshEgVhV?lgwR>2-w*1@$uBu z-}Qo?K0ToTe-acE6C)c&n zhi`l@x(nw|oVyV2Imka3q2tQZB{q&U&~W7zbkn={(V^VFXxMqK%N7PRQQxm$YrI5H z?_^-4L*r~~oA>$A;L;YS>fO5m@dCsJ030EysHi}hTyPT|&R<+yeDx~&VD!ab?yr#5~@yuom}i_6jx!~z$%;4bQ^z?Kz@y1_W z?}vqOcM2)C?g!(xu%<6Lo$+Zz*mzV@i-x9NUBl!Sx8JDOYm^7JySrOiMMao4V%b7p ze+=YEgjkV^quNFKtqqms$_Vc^0Jc+H!WICA1DPO+_*$o7s=7GFkH^q>U@m%iZ%a_lbIU}0JEhgp-AW_#7 z7}B1X`oo70Ow7!q_ZE_qX>Yw9vWxotjx_9DM~9QOkB_*3amRgbN88u;hE~wnI9Xi9VjeR+*r!?mzO#LI?4u5*I8F#o(o&|c{1$VRn6 z*-J@HO$8!xMn;nxPCY@ILD3&1H*4|tWo3)>DuUGW^YcgItcA&v>~D#2iC{q7EtXAH zKVR*_kgVkI)YPxz;w&Xd3q}8G@206G-w@ed)f0a_R3cp9@Rp-a8vOuq%>CrFcjxh! 
zmYN!q#Vt>Vkyl^lMa&MiMb_6zJKi>GFVTL4=qbp^oHj}8+~CEs&|9hNYifSfnJE_s z4Zpd$+1Az;3WfSC@=qu=829dcEO$aA2Rz1O8ob*xh|&)K3om}WzA1u~2njY-0nIB6 z30*n>7%=SnKbj)wP{jH5v|e>Sgn%cGU1yq022D?R_%F66B#v*ExjLpZeE?JCmbGDJ5Q!3CblJPC9fL7 zhU{FQJb9ay1x_fI>~4Q zU9dRu_tN^?8RXJqQ^fZ4<2|#KTb<+_92^=N8hf0xU0q64zOc=&UmuU6Lms$wc65w@ zz+W+oh7}CB;W4jY=hBRXhdSd5aTzhb*SNVO6Q>{U2a(X_7Zg~v$`Q~1`=$6hX4_18 zzY+=@djDAL^sEFYpKoe8Iu>nBw|Mw}?fVXqe%!ACcrPwQ27RUf?%bOsBLqRhIl_e` zS-7XApd_aNp4W)o*zudA@awkdcT1F+5Yd^(D`teGZ{q~pE^}hyr?PiN5qWJEB`26_nbOHVR zlq{on%2K_GmYcQ_#w|xmmp0{g&yW$V-j+V3$>1xATi@2!XGPdrjdIFLm1(W@dp7Ts z#R;3P>kZ2p4;*fL<~cM8W_v-i@s!6m=vPEN-(oAIZM{c^)ZFRi$h>>>7pA510Gu4p zKKx^NnW3e&TIP;*$;)~d6B}>zq=))x$VJBxG6V3)JghaMl#qp=3I=g7KITGXhQG-9 zBF=^lUHP`e;8DIE^tS4@dxpdubRdnPB1?#TeiNa!#^-7}K04Y{nKYo`4-8BD<&kA2 z3q6M7>>Dk#WIV&X$w0Q>upv-p@4dtj9!#QLtf7 z;qKmQ+_;XC9}r@_=p^6TaJ6K3gqKT?mc?nJNjc%AUUzurr{2T8g3ivU+4-g#H=H!x zt#SiDetr*FcH=wLq}n$*EQrL&kN9j0pjcUN--1FwnPE8yl+|C{U%b=WAZnnST>jB$ zmt|h#I&X=!X&ZK%Z2N)9MvD$c!77HxZxs5jN9nlUFtH%TL53Z3!xob8MI#TY(&#P_ ztb*R}xj6n7ELa|f!>qy@LA8^-808fHfdLbFMemse1MBtx7|eV;KW8?{GTQ zejee!_pOh1x3@=IojZ5#jT?u}O?GxY5LyKD5E|<0%MAh^ckKv$GGs5FKex2DCIu%q zO)^rK@P4BWKW?0Sx)OKQSlSY|+7iKQh0z~vtqkN{JRLd4WWBnil3#dp+LOEL*q-I? 
z@(JKe)FrLZOV^-fnai}`DDbBo=d%&`i!?`vSz9}Wzl7{gc$&P7BxNjQs*muT@9_*` zz|Q_eqk%R70{%QaJiWcWCUtJOQf;@b|Edf>)CGPeq}akelhH_x@r`T)*rnqr?xJGLiJwvwQ7x`WD0P?C86J_y#J> z+Vs_XL@U82Iroe^Hj8RO;^i*S-hmE8dkerEJRg@@* ztE(&ULL=@oA3RvEg}NSR#nJFs8C2P6xw!Bd3>)a_)dExnuw!4}!~IT$GA6GEKj7HS zl$BE?f9R3l6A%z!W@5tl?o3=Z5Fq~LBA=xi10}O3>zI0eW&X}QcT%I@adt)k?xRu+ zZ1EGksQ-bmoks>=;`v)-?F#fYplckA>5DrZMJwIg8@I?)>*P*K{)`%BJDlt(GLq|^j;Rn>_@%mm50)jVJ_D@CF0jm|z zcEcRG6F*Ta&^dNInDj&^Lz6gen$+9Z#R`}}3tZe|(-f+wQOoL>#`I!EtAkM`3q`cH z?OCQ??xR@9#=E<=8jy!51&g?;Q~)CdiOx(|C3ENES=rcZtgPfweKBqQlSt%!Cj^MI z*bf(GUQyQ&hTUOi4*bo%H(g?;?yzzk0^}5sak0%qa=<_TVG^I1D9~xe_8gzDiPEKu z`|$HNwf@7ajYoHtoENt)*|owov1K#z#Bd;~oi^k~#rx$QwDwTOYT;g&>#+13%q_iG zBcp;V``s(v`m@Rwigxlb-YJdD6Y&lFC^z&i?Gl~M(^#KVqxJzX1Iwfrv}=;HXQ_Nv z9gEjQwfH_R6fgG}crh^}Kea(plDWS|fuwX%aWUoKkFb%1m{^cDViTU5`?BPd74*Tw zz`($P{{Hp#_1!_Mx+JfL^tBMHc8H$tYt^J6e7^p#o~p$RA@gyW&bN(M#U8=`Jbj3h zH*bEb?B+F&|1atOACLXb>IyB7$*kB%iC3~Gk)5Vs$^*YYUMoVZ z{W({_xLS)pBcsbOPFXRb7#nXD>l-(rL}!5j8pa{BmfSE2Rx9sUSn+-)ejR>v8^OYS zIvfSg;*iQdYX3>O;93&;k8<%UZvCIag=e4Vin(@0zI#N~UG{Gp*$?G`e=Ho;iuRWt z_llr(CR_%;@!0@Gmytz1J##B7mQmIWSZY#II}gcF z!na3PGb7A4K!w|cBAz> z1mA;DU3bNVF8|HXR&J&SM${RviT;Ia6ycr5;*!#j6#i5!D5?CE*!T8$#+Vu|%mj0a z2yf%p*DVU`V%d7Nbx`;3b#dPuu{QE*%RkIL^OKhG{k^#8V_D9SoKK2Ls#O~}79OLR znj%BCwav_BqaQ|3tj9)|aR_a}Dl6rLomCRDZ?H_X(emmVb+oaa4KxEq3lxvfpFfAH z@&j$}PkHr+rSt9C!LDI#W!~HuyKtd;G-d!3S~H>+WRO0PY4CXLNjiULJmDCr8}vp^nb{(vrQlc@m!L zDj3k~5d6gwU=uMed9hP}W~XgO^cJayc05V)sL&%?PouYhTx}JFU$1U%k4Ri&x55HV z7Y4ZH$4*lNn>;J9qQUEGzn>NDtE8xiMnEf&A0h1Me650-ENyM=V#C_EcqnqU`xdQtl=P#akGdV_wm)XDKumD2 z5;peXv46SKkra`*1mbD9^XK`56V$IsY2IKm_s|>Vww(shJkM@vzU-Snpick)tJ(l< z3iO^&-ZIkCBi-RqQ8qwR`UF!Bmd~z8hSE7S-H|!@i)cJf(;Wh66l;CJz4;XL9U@s zw;z9b55$0U=v)%x@ow9u%NnM^a!g)c9{51jld0f`z<$M3W={?^ZYON3Vpv;OOm@9G z&12O2Tf>NsZ+>HZ$1d+*!A{{SmQhM>?V*4{UTbd z>E>eLEmxA{xvDQ17h}KvJ)~i}(CN!xFk#9K%3Bg#^Ap7+^$s6SGoUA_J1J6dU6V^* zT^0`LTLyE|WTCFKJUA!dl;5|wcxT=bPI$^<>^Wg+FsKu7sHvT^Gv6GOHh}U!g#p)l>BQz}R*VscjNy=aHjsUc3MZ^aDmft^B 
z_q%EnkXjN2xZAWq0*tNc_qhi04=a4B_F{1@jZE=C&@dKwQ}1|Yw8634nyx(%-aq;& zThY~b)h|_9I^q1SC|NXJ{;aScpTC%Mih1e)Wb~-L{Nohp(gl1%?gnO4ZHVF`14el9 zH~A~^g^d+-_xb$m*RMA>-87S4`s1Awn`%B*01kx4?5CihP=&VB8=Dd zdcrU_^x#8vd5I-=hgNJ_aF`(F+7hnOFfsUQXA2=gNM8HJeg635`56Nd*LMs(kHJ@J zsPXSKd9pR^X-vJ({8$A`e?cZLBPQkohk=$}uKgwKM5QgoC}&27126TnXU|3|Epej8 zdcvb}qurIQz)i03Kg2RBdP1@luvlzS6O-lX!wi`=C)R@5VzePg4~XO3sST%7sH%B9s45d5mvMwVH^Mei_q% z?(KCx*j@nwq=Q}wu18D1NSKX{O;j{FajeG00we$&9l2lvokI4Ae)kXIz_*MsAfLkx zoh|-)u zhVq2gTyOHp^Pc$a*g4$cBE{&EM{Pt(`a>EZe7V#dmMX_vg#hIi5DA}zC)U=iVdLsv z5J=7P3OJz9%F1dPU1g6-NKC9y2lG_&N=k#o4nD1Lw_)Pthm=>Zwkf(i$jtCsWEW-g z=1>P4S&^sVM#Z#Qqg2m&UmG}f;%~M=XdSZC=5Tv3!%6IrR`o{himKospX^K>T8gfA zgS@@Sp@BQ2fzt@uDe`7Iz$8Ig*eJ6cOienWw7RzWGT6!I^qG+Dxmu zsQ*1=#U`$kOfPpU@pMx6cVSTd@fZ-j{r!(Gd$UK+907Fo-Q@oPVZ7ZJ#dK1Q4Ge7! zMMv_G?R2h;*T18a+j1^NSPG$4Y6*c7aXmMZf}wk$d$r&1K78><|0_=TXmLrD2Lqw) zwzE|49nZ2We!`88@ym6MlNxmuKY;kCb@u(3Sqpx*WsjIyxmRA7Yl_Y^;xu=#Mdu$S6h4@;g59;{ zxEV=oa1trD|1XF%7s%Nsxi(X)UU7XKBIc}Lkpu!?XA7;XK&%jJfq&Cv^`h>1f_nyE zFq9`P2fG_w{(w_{ZTC#4(J59wPnL?6VyTuyl>TV-;uA6F##m*w1jlHr^Nr|)h#-fH z7G03btFGXujDDs0kzTHF-lT5uIXoRLm-a)$OOfdc`iRX$e2<%4n)X98wR-wz*xRUeGZ<%KyLpTM6g70Sk{x zWRS>yBu(ltP~GS(yO72O& zgt~z4U}umi;9eVjWna8tTpNw;F)tO?6uj}5qgj%MpD753S5^)_5?tK!O2~Qh=6wfE z5E_!FRiIsTftVP0gO-jLNl3UjIOrnlx+Z=FsQFtF?L~wC({w*KE8&#hC|CFh+yR)3zoq@LVsc!Uq9tqG3U*hxC ze#OK|H#tnHYy^yzo2t{$fHe&cCieF;l*xKgc9P*%K~_bOkfJkM2|6*uPyQVtO5Ga4 zdBecauq$Efzwk8qWBgC(G#6P|SQr|5fqzB7rsrYDIUeeupNY*gKqy_-sY~8Lo=0Tr zSgp6X!*d@6N0hMkk2oN@)S!hz|8EQLU%?=_E>Qg}tCeyjpzAm2mq*H&`S{dEHA;{3 zn*Yp?{M{T8{e52W8;S!??fQ%Qvofv^`;1y_5McT3jBnUSq}3IajY9Mag`}kXLD-%1 z1;~xWME*TCLi6b#u@OZE#k2N#_i+-c3JPBPxSY{DurY@Nt6OJ(#-|s;fdy8C4+`RD zZZ#^ml=Su4=x$0(WbEwqcfs|)02?>s@b}wa%?!tRga{OYjDVY%g{Y~yE>F~kS}bO8 z!ZI@hN_u9YukmNko~4brzi;qQqt7z`2cOUUn?cv_n}6Tb{*m74WiKk3+;d1V&A+?! 
ze==JYU~#@&2qNj#(n0mRn7)}z;97Eo-YrJ@?V0g>iD$V55xd+LgI8cK;K#D?M1+LM ztE!4@jRbU(y>mvDnDi!o83V&)MG3vp)?jv}z`l?~!R&8gfd3F^D{A8Is;Wu$WZPxl zJ-)48aHv3Z_xfJ@cQNQCkaKS3a;7Y2En5SD2yiCAoP3fi=Q@;dV4YQXi;=bbN?HC+ zo}CaE9@iYc-e*GxhLU=$VOHkn;l2mk2!wcbANrH=!<=`}PSdrV`qk%K)I7Cio(@a{6Tl~Hj11%UyUQ&}aDIbZ z!rA$9$hgn}{P!z$=L0p?4OtfHx-#s4DkZenAHG9OzbpHw#1|5jl|p<4SOXjx1hnC# zd;Z&s#r5Y(PEq}z@=+fSCY~L@c^3Eg7hwIKUlvNO z*Q{=W7bT%&Z8i`>NTJ94R&SzwSGbJkRX#l{US7aQQZB7(J@U*kHZ8 z+dXiWGx-rpzUT0$0MnC>W3LFV%G<18V3?sH23OECT{x_Sv4E1(Gjv)g73_D1VrC+$Ijx>KppSl2^B5qFbvFwa@8ct zKH6J0NsqV!ugH(1DHohsYOstbaJnGID#j8>Kc8hqs$|$T+4BJb1cr~tt;ER&lXEx2eyS#CgQttt)345Vy&8<>7Q^mk(xk{ zE-}pD#6;d7M}{y^NQm`%3i|srRy104j6FoFTHQR%p8CV+{f7{g_~VJMLc8@MJ%YXv zzZ!d*7d{({>>~aguf&UBuXY z!U+uvo9pJ})@!^b*eNd$*>nvAD_6#S_Wo2iqpMAJW;t((PkJiFb>pj_sOl=xP9Co& z9~5uA24~8_okz6>pl(u)+RdEy!wzhQv4wT`C+sYND@!=*kQt3BGvw- z!M#+&5414~MsYuSpL7~lt8j$8IDgmQ&ChcC8Ew_>)&0eD7?qC?S*3uUJLz}82vW?V zY5u@3W=n^>J|fiyifM(d##~MYLm~?mKt@wFycic)*=paF37IzBZK3~K)&FF&G;0WYXP#A6gAxH?PiFtxVimaR* zhh%&1nSX+Y1k;noOAM`sovke|ETg)i))bjWgmJ5M2P+x&xwrkXrD-tE+5(nOvZ@lU zgE4~YTvr<(^%Efcdfe#VQNb%+gj6BxX$cMG-u8}|(?pm%A(S>S$jhMw?JrqwgMS@ z4s~PN-{|##^+v`Wi7jv->?YcLZ*^dW#d)N0!Lv`ydNw4Sn%UtSumbYz(9#-YrOL)l z)IXh^R&;Bo9bTmUt`9=F8>8v>Hz9k`DuGkK@)7U|+1c5*hn-s7Ta#IxMoP^97|Tal z{Wlc+Ur^UfzDyK!oWMI^(KddfApJ1>NWoLwdX6>aoN|XwqK1X7hhSdS=1`4WP*_`v z{6pOBZHo_T_Tglf?0w1xn#JzY3 zwK_0!|jq96yx8>zNV6LT7}>un4se)0lj3-pg&dfyUv$^2k>WKr`xYSoZ%5<3SRrTY5PL7Ox5-M|Hs3#)V~J1=Qb^q{*L zsK$;6w(5_)MW(~YcUl^*Z!g>ht>h}W{TfOJy%X3e@Fv~?gP?aqFCyj09%m$X^ zI#M%&CR+(TqM(m^s~s^!0#nlY8q?urxHRjn__Sf|z^YMnVaAvv({x`sK18tozTchY z#dop`R8Zxe^M*bVZUMzx-XE+LQfLnzrpHk@8Ng_I;1uosd)i%wclXAO2GUaZIt(HT zQkFH#DVwYl*A;eDs{v`VzP8>ap-mN+@PW=D;d*281MEknqH!i_@;X{FRM`RZCynMf zvy%Vv)2ETGxfK87i8b;>ZX;a{*VVmwlkSeOdK*dOEjQCd*uJhKaGUv6>oR zF5S1>@89jjt`E{B+zi~PoUR0?*PKTDw<_*NShlf_^1OoSeWZ<*EtLt7%_Jh=EtVXV zWtx{`tXRABdpuKPx;Lxc?to&wQc>dATxcP79>iLH_RDpz4Z2yE-)Cy)hI_dd0gT}yEF=iaSNsr@BH2CKG7ystmw)a~(>Xah}>Q-(DUb|DKy~V-SEWnXhj1WRx5DgV|F&_ 
zz_@L}4zMi2FEIb05+;~N%kNz44e?Zs=C+${mY>siILqTZ^vXnzUn#~RKdp<-D!}Ay z%2|hPl0UYkn_e1zmN&_hPa~nv<_Ly`KBzJgMI*Zg3eU&N)`xq}HtP!0jdpyrHCPR) zi|ja(I=4Eb()yTs?GoR?{Yk#?0a~6E+ws1ylIZHtGFuI6uYHczRvtOIV1wfS!`NGg zMWKD)!vh8?2q@A5N=t`yDblGZ-7PIKbczBJBHbV@Ee*pEf(+f#rF2LP-Mk0A*RSaP zJiqr(9~?Mya__a*T3h1MJ}a?q*%_t(7q_*mJdpf9su|`3tbJdAW#7ByjXs-cfaZe~ zJT09l{e|t^zBuSFdY<*J^0M-S39mM_DePjDOu=THy|Omfu*UPg@@)x^lxVS5(}Lkmk_mWHqTL z7%)!65cvS$Nc*O7#-_^k(6?A#yRN{Qv)>~gWnEfkSi0U1y{-rUkv%iN|08=gCMo{k zG}-p@eVN4&pt(8ExKq00!A_dL#yyDG$+f}Sup4x`Ja*%A!a4}dWEDDd+E95}$N=W=sqNk3XUtvMscFKnn?+F%FV_}CN4de)N@!H$l+#l?b) zmk$&bSHgUv@&*jUkVWgc3|L9)w{dX75)&<#3S&>AE}-?6FAo;Xx0+TV0Se^f>nqJ3 zO!*&FWwAIU%<)j(Prab!_Bt~9-+ej zmC5|hcLR`U0=6-y7AE21m#jeyBI{~}5S#sEe{)xywI8c)PR00J8Nwj>wN5fog<4Km#aHEh>Yx>5Fb2WlR{tM4KGIVX_dFMw3L&3%kb#I14z0$ z&m&36mK4zdpS6fe4i1hkp;K2?)fl-<^s84vv8UuDlH!*H?%xVZJ#F$*OP2sOROlEryyKa!tN!O*k!C8x0S{Q|VuLIZ&$Me{xAoPXz<$3AT#q|$2T>!;l z6k4q5T4Fag*w;V$Y!bPDC~)1_w-+3jGe9yqxs9CMDD3r;-AU}pi|2sdk+a0A#cQE5 zWRqGb%J>Jj!p29JL#uZjL@0SmhXLA0F9k_KSpm4f{=_Z+J5;_lua$3koYK-;J~-w; zqMAa7TdJtZU3L6w5F=^X9hw?pK0ldfQBXN#ou5};9CaIdgh4i`jv8M2&D=Hkjx zrzqX>Z~V_eU;Z^+!oLDqJ3#)* zzY;IJPo2^+R*Z>=KO?F%=r6o4D6T$?TdE0=RC}SuAMOCr_h}Qoghg6H7Jrmd`JihL-I`EB~2c25|Dv`%7A|?Z?s|wBO zKzzyW!0cZ7rmOg7(>!y3nY=V+aXIW~;rv^-eOH=lAK;~?61T~-%W3m_Wn;HP+IIc9gwsb(NAV+V0f3Lpv}yc z2q_7P=C(HdgWq*PyfS*mchpbD*WAU)JPt=EL9JI`saWdo>16z<+5}*gjLde<o5m`i0> zy^15?!slEn1oJ7aJ;>@>BubBZgn9}^ux2MH&McrYT?S;!#PTHpBeCY2hQ%H+lL~sv z>V#W80IHHO16qR_AIZqJXOVqBnOxt8EJnT~H`k~OA{A*m8fIg@giCUebZgr)ylyEQ z4oLCWK*%o$ZQ@4Fv8T8!$(NJ3%OY27egwsg~gSQgdqHF1#&plSl(d0_0O z&`YiQp(6WQ59A4+QL%`tj2$bghPX5Yc3YrQOFbzt?wlIeoy>d5OQ#NqMx z$$vOB>9>N_@xsFYfh1Kq6~<12FZeKA-#<(8aWspj@}EYy0ld8=EHo6zTHV~-&fLAOF)Ge}`>Xs^regvkb_p&ahuVF$3{!K_d@((Ttw*y{ z-+BSQ%F#eW%M|@N^y=N+d4L=*+n6p;R!2D>%`KziYd?BptbLGOl^$>!`Xmtmva&QG z1BS}dLo!Cv__W6@2AS8?h}tvcrFjypbC+GwI4E?!eV%wrn$ay-ezC$4*3D|`$cYly zSa`e^EdfMVRR@)}Uh|nS_d7tblh&)bd?B$N*8aH51hUR~D}5?f>Xn 
z|FNCMFlgqJ`l7i(UwS2whsa^(va05`>GzbGt5cbAnyD9j8v}%bM?IQBIK*e~48R+6 zvUh;KJC=S8JNqX=G56N(QbgNepQ%hB7dbgOH8r(>!1x#aMNgUN8qHdF0T1JXAOxS~ zoyS^AJIA%Ab9Jr>>n}k3cU~Z(t`p{OsIFM}ro!&Q2(@ff#{(mwXT9qT0WWN(4N(k& zD3{u9t8l)7mMU~sE88Nos9W8IWh?FU~QUG z>Z%Vbv*FKrX@lHiwzxXIAoI{hlT?cVYgc`MQWck|<(X+>nEbQ3=vAr4;}}i-!*7GS zaEwb8f($!JEx)vsH7I^<*lhP5O?F1|aEz@a45TNdsmX?NhG7RP-V}S^X z90n{4a+G(wE=<;X2)Tuey7}V%nek%yoA=!Fy~HW&o!;w~939Xa_!f7jrWG~Qjx_tV zptBruBX$+1VvmTvKu|Ap_MwitTD`8BPx@u1#@0A5)tQ(#^lgZj)d}-_assyDBykBh zEMW2xbFCxeiIGffTJXz~gy3!V?!<{);ihWq$`q3>51pc!4{aGScgr!z&&XJ%+cfU* zd{yOmD7k6pzAdS(9xxj2e!R-+PTz0SpE?zp*zCu;foQwCT`Cy1zvO?LL5Sfkoir%v z%gD;k7KZ~FiRw&bZ$m?ap%u-Pg485%jxU45<0D3j45b0EXrwbuUz5&m{k6`kqJsNr z1l4QzZ?Ey2ZT8hefVw)-{lH|g&E0qZUn7TKn62*PT=B;porH5pbpT+~(r=34W8pKU zY)K==A|eczo>xoC)K?-$dT*MnDB0JAc2sayW98!EwHX?<`AcxwMwY!=)1x|8jI>mQ zW|dWEmbQJT9;)L7bTzZ?`)iHIh182Xjo}hDqe;&{=y$e2%RM-4=BYx}hn+QnJO{QB zY4xD-LM=m_IgD+%2+^NckeVQB=X7tFT+7sZ@Z~HK@dV?RTln5mfS$4`uj+^vC};u^ zrZ2;nA0^Z!GdI$r?Y@C8Tc_7H6+cs4{xg-r@K3)&7Z=pNn`)W^)8W^Uzh_YFD!>>p zxj`_u?_kz~HQXuRo)p7m(`L9z7Zjqv$dt$=$c6Ti{T;p5p0U1dmX{oT96ksaFr;w; zd2$8pvRklZqb|jy2S8(n*}}NMgD>0hyt%+ksK1qSb*i?tC?O_>Utg~n^_g~)%doHu z3|7P2i=6%DUYXz$WjFodeda!|(Ujp5y3VV*3Lq8R=#49LHnU8+Q(vz(*n zv_O&dU5Kdh*8Jr|fpor6zF<)Cf<0fX7U|u2x78b?iRuj~p@wpngZauvUjL)+7lP5tkdy>%?;u1gG`YW$@J&Fkj%k7gc zo5^DiE4u&Mg~O%eW*nw$-cn_E$G}12{;@L2{D?~PNC}l)VZvldv0@DoIx~4F(^dPr z3wV(~nPq{Ie>2N!#BV)PlcI3q3&~}o8?t|nA3&f!QkGpyMX%tfy1e#(Pb&5+T_(Tg1YDmRK}@BG%hk6lz>W4~qGxF|r%icR zVjB6!z4+06yksCZ1u@`U*MP^JEEEQx_G}6+BWMpZ zlLM4@&dd#P?sB<<_4}yyIAaqH#&Gkx+K^a`SM<6f*2}Pb zd|uAZ)k#I)(x?bwu)PMvLL~IbwIHHkSxeuvyM6i??+w)=Nv%$$bRFefe%gB=F4}X} zq#sq%w4Cp7DC)>D05YK|Q2Vy9xzd(P7>(XRV}cdJY6RE6`Cqb*2Hyz$QQnKoJD9KZ zxdR>iqTN#02rmDR!ZVAjic{fAbAz^-f#D5i?0_@4j_L)7P2XCyG_xQA@o_W~uKfuU z5D4bNhwLeJtBecv>DQ=G>sdzvqt?9m7w}s3F9Rlp9~Drx{GWz`qWkNO%YSNDe6Di7 z9{L_&Ax^L(ZDMH6j2+%%1W)7|Uu3P)#_*{@E28~LnzM|p(0x?jn~si_^IT#Bh%1rp z?0iwo+VRm9^*#XyTIWkrrckN?JjSWDX~hJfM4wOcr~&t5W*?AYQc_c^d9;PV{;Un4 
zKpy^}xLwhOhn0_&0PD1_T>lunTRQiGt2a`A5X21L0M6WXx)eYT1G)c=8(4i0o%VC? z-hFx*Pym7Di*%4!s#bxq)3s~Ya^rPXfWc6a4RWf{?p=7eh{xu}2GEUfZf*iVKigRo zZxt(0nczP-JvSO18&0Nqc=uW?xYRZ#lAEz`p5zUzHHQcRUCjVHmgryJTv)M_xksOW zI8?-9ssXtyaIamw!_Cc2IkRDvl!a7i%~dV7<0WC1xL2eAU=z@PE8G_o2oYV~+&aW8 zv#8Q00n7cn`4`VotJ(xixkUb~uXD8jVFloS;rLp#VK^~!L+&o7sC%@Z(lfZA!|HW! zBP9l*|M0mQvi$#-B^yGa(0|k9YAqqF2{>S94`1b(DF>Kh(H7wG0j~>xfB%q}(f`mz zf^>QWyWWwIZB3Ek9{`E`Mk&=o_7l3Z8n$wtRNoIg36J>D5zxWuh?s4^fF@TjU!o?+ zMI|IuTaNP@RVqwViC_O|c!ZN|W@`GuPbo*>pCnYLFZ8^e!LZ|wIwlrYR7aE%qAkez zzZ9o0RXZ@nb5%PsUaTon3%Lqzo;iQ0i2NVH{+Z%R-{O%X7brf@mQ_wufBnmw1hnbiUZ|ItIv zzM&n3oz^a|3uI>&Y4ofp-D?G6kcRr6s$aG7KT(1?G;EGZk> zE2H417%I~0OUNlvvyhXMv$MO;TnfCx0XuhOL`}*#(rmy{R=Y%+MjI4Encb1-bSD!o zYlW&EAoscQm*|8B)U1FJnJ+~OvQ;BWx8TUYxVF?1n69hem!?zr^nXTG%0co&8Lb(7 zG?^J08>+29#KK@hKl&u501o)G`hP}|2m|Ttf0d566&5lRo&DeB&$Ui;x*Bc&&p`kc zR!0Z$t-4FyM;-=XjUEHD zr1N1wPNOI)PM8p5g@0KpBSL-T(DjLh5PgA0)zm zS#UQ+ejp5=t@Bs^CBs?0_Xc!)EjqU*ixKmJOKlt%m_|mQtqJOPax>Qg_Lf#dC6G4* zhYlxA5ZtNjYh0iKZ%W3mxngKMq%Z?l)Vori#?7-ZYy^Cz>MvuH&|y9Ax3B&~(+ zCO;p`Q(U$$1h9*|&*?|{`(7v^t6%<)C{GSEP+7yL!BJ8P5D(*q--mbVQ;}$s8a0hs zjS3A9y6q4QO033D7KE;t8;`Vybds+|%~R#XEUUEw0LnmR`dcHB7fg4=wdgeMKi}7w3fKq6#}4)Kp&G6wvGhe!nIJ^sp>Nhu zv)!gN2%R7W9vYennjEKgz@a4Le%LbyQu@_`g|Vw9TPi-!92VOr0uCJrfU3IIaXt7^ zr9RLGG*J>Hsi6KDiNxO~D5wBI+Zk0)+oO1HsXvcDW%P;lyDp2lHw^yLOd}u^#g|K} zA~OzAH@UG|&JcoTm0lbIyl|Z-_ul~&l_;zcmNpfY*Y9JIQ?XhEbA0UF{c-(0FeY+a zNcPnNtv#zuw=lK&@46xz{Zq>PQ@+)}>Rdv{OTvQIjLvRDSBk0vQ9#|VI!hk;Rgyqn#JCC$}~8;gm@Nn$Y^yl7=yi{ zbR*t%*$6YCY&!wp0G1P?QnKYA&T15>j6!Hd=6;fZm8i1Q5bm;edo@RWV^oM#U~?sT zQozWt%7lgOV-iNk`l+e;(7vUmBozsKYeC>@52m9_--k2z+P?;AHr|)!POG^}Ke`&- z^waRdFisR1Z1jz^E%x)6VyqWByS7d{Npv3|m}+exW}QY>7!$PJY${)KF@3$eFd>aU zyk(-eN)YG}4LoUX#*Q)-I`En4z$wsw*LVN8f}?cL$&L(deg4nHBBslgQ}YG=S=Mb* z+{~Eias1J@T*ljQXxoTF+M&G_hY*-e>0IO<1$k`bGI1{dRt?McHwuS?AMt($42m?ZmaH?mC=?%PdO900cIAIqpmH^hdyL8ZfHf! 
zmO$EW%q(pfYV~0I_{$Xie>hNlk#$Oa2Bmh^71sBhE9jYJHVD(I(ULT7h32mhvv9S1kANje+kR}J-V?d|Kv|; zZHFx7<#vNfx^337#?)q%q3x)}lUC}A9g9&e6R+a-d37!QODo}io#O%OMCz0 zN;%UJoU2s}FUQavQMNRSo5fE2G^p|)T|XUi!C9xxwmYpIy8A#__~K>N>&?8LL@!aN zaG4((GBsCVzj(YVs^nG4#vu2qcG^1yrCts#84Wv2~>?Prrb32eUK5vxl}PsitQ|GN$n*$YQ8O5Yp}{Sth#N~r)2lt5A30%o>X(O4WsHW9QoGR-}iDso~Pf15U{WOI- z{n*fk_QkVvg;qRQ<<`}yAx6ylM3Fr+IODWoJ%c>wcA0Jig!}v#k9tgC6;b7^WEuIv zzYRVbCzF}6f0ES^34@cp$?t6F4zD0*(u%h2&S=9Poka(Yg4 z&C+SB<{YRM%^Jnb{zxT%r0K}}(;#mf)iPOme{>mU`1-auu2pSv%^*|aZka7$iRr>I zeZU$kqpx8oM3-HIm_3ZQy33;V`l0j-5M4tH#~ek4w+~*YpijDr`AY{sJK6T8Odq@^ zpU~wfUru~)HAqOnhauC|Y>TihMOw*q|M9@KtrPC@WQ&rFxZv)-%m^(RF3$EkiyTGN zQAP4h-Oe@S^_=lT6);cr=^T=Y9P*df5hutpNMznNT~ z;T?dc>REt`A32NT{o@UjSU3Hq7Vh_{L=mEbKIMw;g`~q!d)Wr8;RAp@mVlGQel%~% z6S!_NQ)&+1UdR2lyj!dNV>pe=`0G#8iuzrGdii1MaQpnnQkr)~Hk|=NRbUHVF8}*z}v)H0keuCGW_QG*KUa%8QTtMqBVLwu$rpc8;=TjqdSQFFuU-IMl?p=`TMS%L!Z8)oZ1 z!~Cp%_>i_gR%*b}5Nqq$euUZgu=T30Y-`_AMks7Cp9;S-&sCbffEKek~Mdfb$ ze66EzUvs_JrKsTVUu46nu(PJuiJ7Zbdl`jp^F4iUSLKc5#m$b{^4dIj@brKNOM3NI zS$Ub^3=5?5uQRnQFQn^)xHo>1VphT*5zbvffYokTW6!j96gxaIA+hDk3CrnB;s|~f z73?3mdvAERV|$7`F?-4MM7}6eJrQ?zC#bnOzb%+{d{qZ7C)!TZpKdsPDK+Vu@t_{7 zpF`tkFHJSh!ox5~-PJpt<=6xT_3x=FK!^`hF@xhy5sL}zVOVGgKr8l^u^PQ8%VWP# zsVuApO*B*COIkvwp`vSsPBVGYo>G+VPF$J}hDC;X!i~#OOfSTg;Zj_F$`~EnjP0@T zn+0|Xr@d>vT3U4OLuC_wQkpn&&ibBq#7z3t1P3QE{VgT{GspFAy3;qRo!AR z^$f%N z^!6Mquy+L$c>RBm{pg<@$>t>KQMq<*6G|^A(e&Q+=>c~`tJc^Fk93AgTvQ)? 
z<3Maf;X$mpXJgKN6Daw(OAmU$AoEOMVYx|;W%x`v&S$gPqr4l_ds^d?5tqmi%cEc- z@U@4Xwv2f>2nxcmJ5Ac2wdi551%9f23M(r=Y=G3={bN-CTAtbPB0@>i@lO66p}ouO z=M(9wnt^s_YBQgn+SAxF(_N#3T7^FQ@6~qk(k^8SY=xB7ifb$CHA*D*%50kQ56)lTx@!f^kT9^ zlwVhIkLGR4uIz20u*u`m#Z+R%VA0{`UqgIV@7U(I=wE}0a40akHc7+W@>SAP#`?iv zM_JI4&qFyKc(T}Pmkim4|Cdxl8yE8 z^{%e=E3;xzMi2(HKR-G6`l2ZuW4Q6NsF~og5zROO013%wW* z7S3;OP+yY6Mo7O!?pBLu6pTq+^R=I+VVS_bi#3@ZJQ%t9d1iDsu(tAu0CxX^4$t|Z zfJJNcYO|x^;H-G}>d{36@ilX@njHGexOFJSQ5UdH&QHhQurJ=L>sFI~;N1O~J~d3I zxj1^R5kA+b#kjH;Z4(r+c>~uyz0Y5G58n>jOLZWfZ8a~ibjs_%2SP!Nu@Q)xRaJ(CC z(PV+{y@;5Uw+Z(aPBvUL2yU)1F~$uYBOzZOV9%=qw!UunO$`9zXzIYrrmK|MFKzWH zTi(i9BYLPne}}ZP>XLJH>v+16!#<+j(t4jzozLNqugC($)?(jd} zc44px=^iKab{VydHme!k{oZaEOEU^TiMZQ5aRHnB&$0Kz_*;na%Um^ zETH3H4XobJ?;<8ZZX8;w7VANj+B(e-wlhKJKdg3rTy6z{<|LW=j@Wxd+m*9lnoNya zTs6XsPo;L+itXZybK$G``EhDk!Xx4wa(nNpZg#1e9U0wUW?Do>B-ylzizRv4j;HL)%!j>o8d z?AftjG8LxFMwp8~J)7I;L6$u9t+p|Dc#)*28i?=tj=Z3X_z8?&Y+bDWq8fyo^!%tF z6EL@!_P?#Yo7^v;RphdnuBp9QwmynJnR`b$w|CB~HLuDv)K;l7$9UH0O^J2&460f2 z*QKn6Ms1Jg(Ftdww!STasw*(?-7dxxABRzYUJ567?b}-#lk4ZTGuXCDbj& zb0HHs`$C$v8a6!hRo4o)E2P;ap)ISjToVhQ-FaLX-aqwoUME~pLakItczdpg3x^&_Qbz&qN)!TS3vI|p!?OuT&sbufJQ_0pa8^G9CA5}vYa)$&QE zr&Ptjj(MdS+byoqJHC-BhC!?L@lYzkefnCFJw_qg&~h6qgX!+E4`_+`k2BvG5#-YI zmw^~lDI#OEOUumMU7PgijOS_axsKj2ifj@l zc6Vp9COzvNzdLV0^|}$i1=?uvbtVklGi)|?%3(L*4WWFpi%I5Z2TAb!L5d_Uc&ETN zw5(jP?oC-TMx`>K95UC>A10(ZR3Wm;$RIY+DtPZ^t?TXKn8nGC-R#F9imH2gw4u!y z?DhJMiF!*$C?q>$t>vGIO-QS14mR)R+4M`vj@o_`W#i#SWdpvoW1t6WflA?28?)6> zEHHcPF3_(1CwtTB(ovVyfVXCpRxB^nbg12R zd#S(naD45k9SMv_*5~EbNPUW4qxFunZYt;PTauj)|BMR#KR_*KFEVF5zkG-Pul1;5 z4=X~pHn`2c`ba%%0(~lZ6^L<4rsSW#YO)C#(wrrE)#Q(YKxnL%)<2zRpXlqzZj^uX zr7*as_MxkuU?50!WjNT`pYHDY-Jk%~DBSM2NP6YB=8PCEE#RNCPFGfE&F4Z0N#Bv~ zF7?whGFsHXx@sTQNCXUeOb%LugdKdT9Eb)p)kPNbeK;IZK3+I~^1ywO<>m%ums{My z*Gz@7IrV8;1)<>~mQ&kP=Gm)qj>6C4WRv30*YXp28EC>}t4x>e1S;g)o4Cj&Bqdd0 zb|#?D>B(M`P??p7ekGS7{Ulu`^)*R45L03X9G^MGjDQY&TxgE4tsRW%j z-bJ!=KU*YRrP(NHS4*=ez)KAX0oytLuj%8@M1m`OUNGq%X>KCFNX}#AK{|T;M_gE2 
zm{6fX>!Nm#iz=xTJ@xyWZ?p^J#xzxP3GBWt2QNHC(oOqUak%q&vA3ru7B%yloe8UOGTgL4>FgM^W0@}UvI<9{O2e0^v z3!ja>eOZz~{607EhLRW9{<2(9D`$0!n3zxRv)`ar)&7k_+ohu146Fms64w*ZPj#*% z^<@vzxFa?qG*k~Z4;pDsdYrm-7WO4>dAPd^dXS7Y#E|m8`0BZUSq+%mhikP-q+=g- zFPsYBhQ{p0# zS)fO@D%T#$km>YV#i?@fN2`5NF!O*Z1 zGLdpDvM|{$^{dCVgT*m@hc!q9@l52DHS+2@@gk6KbZbv0g$73F`LnKkU&#}4($kv) zCCJ%nzuj4rBom5ysKb0u5;cf)-RonG6SI+SFwssr5w>S?0~-K<8K`BW?1iac_^NaM zjutNG4Oz8j#VvmPxQB3vJFJ+%DM#_c2fpxp^uoo*6U)lX+90`@*Q}ORPj_PLGS7=8 zn=*wpe>@UPtVdndN5=KtW0q&*f!>|Co^_&lsMdo3+TvikO_wK#9mg>K{`kR)u5K04X()xty!u3w+DyVRl_J_aEd|V zvh?L!X#84E$r5={RutB8-BX#CcPz?&R?SycLTAh_3ALHcILDRwPnN3_Te!_L2j~@@)YEa%d>iu-1Tcd+Ai86-&344r|np!M29qD~(CF(gs>a`RB zNo^>FNz3xgOj@82+P6-`hTsJ4^pt@49HPF>Kd0D%NgeyCQw*twV8*Jetc%a4H=q4? z$JNUDW*CL4gW_TfgB0KMIW{?wu;ozXh$cB??9%znkT$S9Fgkx>S?8rnu(G2S5Pj^E zVg=0Rrdk`1b1U>E)}eY}){pzt?oFY0LR`Fk%Pun}#Wo2+DfENTQLRa%DZ|M-* z5)}~4sv`)KpWVGOZCH_$N1$bPTO|(I`o+iRs^=+ljXm=fLPOd%-tm}g+VINqAPR2r z?>y*WzUo|g=foh8+Uk2;$5b96f-<19-P+n3P~@*~t2KQBSYyEVCvOtl+Y8w*jcdDO z;V7Z#P@I1eZ#;+Vy`>$MzDfF$Q)@i!GlNk&Q)ENpcMC2L%&GZ>F*c@&20Hp$e$Xic zbln{$edTT&rGJ9W3*^qh(K`b>XyEVf>n3~)P|dXyXqYXm3I@+wck&|&iv+E4o2nZV zVXEsOa@H`CQMu%)dH${yiA6>R`)5sycLQ=FOW8I>A7uHd9#kyNc5TTkO6@-#{aJqsSel zo0IF|Z7?)RHld^SM+yGLA-!VV0w17FJqaP!=Iq1F39)%NkH-YF#za+ zDQRf12v{D_yn~B$_ohqFqCpcS<)nMtOpTpydteIp9(MArtqTI^%0$j_q{IZgDEijv zCP8-Nhba;27ak32u+xl~qg9+Jvq}s^ZDy*kPd?F`>t4f(}S@@we zB+3vn+!+8SvQ_mX*l+3;%zsRnPYespC2yL;|uK-tUWQ*mlqRV z<6g1r8T%InfQ&Ub=5$cs_}ZTQYlW)z87e}!*pBU{}Y%#7l z(sKWkZhnrF-EKz&vD0m~gaDgj3zV;pfBja)5&p-OQ00*d8=KSDjAVyH{{a2Fg^#+E zhy(HW5ZqYBbJpW`1wwtO927J$4ad*UDZr@|g(x8LXy>LR_$I9b;G4sl65&7EHu6bX-q?6Uaoadq)%cOjXr?+O>O}E9x z!V7L>jxUjFaAe7+BP=+A1C&1Wr3t$>*hJ)LGFQHsO&Z28&kLUnNX+hCP2S*`evWpq zsrD_a&DXy9fUcG#A`M`Lv)Q73oK^WUAl;gDi!*E$Zxq=ka7}%t%#2@<;F4FtQqgq{ zfo>O#Ind9)+=samlIBcigkpyeQ9QqyuD(XbVmeVV!arKSd8f6b?z}#h*|*RIQZwqj ze#-e<)qX8Ds)m=Ss<^aAwmGbn&4*y7O_T4GmnY;V#G5w(3c=7{^6+%COy+^Ubq)>I 
zw~JO;`?`~$`00s1sCjtop0K@SK*a7A7m2>kq2Qdi{$)RK*A67T6MKF7%{Hy&Af0A_88k*=8EOheDHon3m^a$Z*d}#-;ch$l zr;_4TQVizMbXQO*U#;YY$x76t7g-pCo%kB`+|bemyGCt}T9f+3lJ%haJ;wa{l!8QF zjiYT}!~M@XaL+K<=77cF;RFQ?v)zKYKcQDBCcb->P0gw3EMvzbCy&u)Ik>mL1PQ@7 zzoTXm^lV9u#CS_=FqrqG%;tpDx7J6xl7>0EMCnd;%Bw87^l9C4m0U&E&n<%{1F6vl zFj`bX0+^s9bR^`3GOLfBiuiUZ--ZZK2GAj}++ArSZT7t>AhX~xo|kCy*bSmH%+(eS6eD%%G;XA`v9 zM@Na@BzB2C?B6doD!cfkrN+tt7hh6m(uxRpCKxwpMwNdT(4&LRxZl$p-CgeO%u2~| z;`itc#pnr(k#jE}cV+8`Y-suqRoTJ{gy2cg~Aog^o=)vg}gwW%6V0 zIJby~s*gKr{+10+$PS*l=Khm0_TU!uUY|;5dj{Id;`vpz;uyAU%0+?pixicHm4_f% zR^<{?-c-?^j|^(sT^s-pm3P(l2BqQ#?JNLp!&Z6CU$8;mo`}}Yj9W{yZHHxA zpnJ;oN{iEX{MKKtTY=)Se5TBVL~t=>!2N>gGN~|XQ-HHv-L;K;T2yazE@AT{yCIIF zZwZ~>{Y6d2e5E_T4*_!4(WhGh&_RA|vvb-T~);RN(0g{_=M-K}Q4X;)W>mHj;Lu_jB8b?UTg!zLJic)l8Oy>N-sRjfR6p zlI2;kuqqG&K|g=-zqvn+TP&bnaU}oh0;rbc>WVhvJ9(cl)75+?KJq&?qVK+!?_yIq zg^w~aXWf<&z$jqj5Rfbos$qEY7lJCmu2uNeB8&cj_cDQ1mZQw})RSP#s}2G#-?UfN z>^471>lv>^u11m+arcgZ4bAm|51xR;owQGbckyhmhmoJ zNQx`$Z%UEk-FupH{p3lVKma2OZgTI zYu5aP{M(HBX$5`Q^DX7-eEh(%>ryXYSxXFrIbtV-p+IWRht1Z}+B^_6R|-MAyq=!2 zC)GYkRtutmmEI!j)IBu`N}sn0;vTUzHWY3Y8{S)mLaxdI;rT1b!V3`B^DoQGCsj8B z)DY4}(sHGSYd0pRg+ay8bo2ugJ(HN15846Kghu6)LEjj`QxH8D`d~xq$j(m-*rowD z5$dMAQSoKAZHUJjRYOI)>oZi#pE251nDHUl7teodmI*(q(4Z&`(`|>t0r%QTJ67UA z^q`B^0+{8rLP6Fl&Hob41i8cg`(MF_aNQXwMpB=}<0-ZQh%F(8L9GRJXZyJ(y;7P3 zUc~mI4-y9P{CaM~LvCX@XeQR!q*{CC$9IZuun0%T6>fv@vyn?Qg|9}QOG^qL0(lAf z{S1Pg$WPcmvbIZ%Z)w;=GLrff+|VJMfw#d~XVW?ftBD0gAt4(F^s2T?fADoK)k%Vc zEn36@d@+%76_kPEH!2biP3_uL17u)AC(u%pL`?A^kI!W=_z<)@NlhnHZIm}mFeY&} zm{%3}j(Xr#+EYY@=v@7M?$AAZ9pgg%!%ZZDDg7*8Q7kjbD{wGWhK;{Qg>a+&9%Ws6 z@I)ePE^jo0IIFF}E^|vBF(Mp-wP*=SPDk6iU%~X=`aK(GhO~b2h6d&HrHLz?=C81x zJiYq93gXFfZl^##ye$6&iw>3IIs4iA`Jd5#imdjP)%Uv~^41e@vHj`H+-310wv0hlpW zVr}jTjFBW#!VRw+dU$u(PmocmsiSo_m5({ICN-4l|b#+bcxqrmYD2Xaq); zgK?#-eObMU0?|PEJyt};zIv9yw;$DRyb{CE7-a2^0*|2Scdn+x4ADhIkBD*^i5G1y z$h-C7AlhoS=9)RUD4S@ z@4oy!qK3S}3{3)cl{cmKAtTTWdhSMB#+M;OnCI7z+)f~d)pYQB_ux2K2JR%Gf6KIB 
zXXKw_0ij&E1WRlO#V6f7O)eIr6S~BBj7|0n5OS1r=dw;MP#^{%6UWWtjNh5?ukdW8 z3xVt%Y&d|Fd_LPkW9NsUa~Evrt?%u@Jz>MI|KVTJdCo~L%AXavz%an=74eJThw{-M zyc<}t`>f7vRbeJcZpU-2L9sMBNnlshI6xjyo?k_km;5jA1!Bf8yR89)Fxd7m`r|bS z_BnnBGsIAQX7_$qxF{&G-4uw?eEk8kRbAj=c;zM>;(6u&;uj`T_q}h-wMRC<2?v-; z_Y0H)5N`5#)d66VNv<1`hdardFdPK>165jZew`#Q%Xj5GbkMxo?*(cNJk>(!9%osH zebf~Q#PZil>EMHkR8mS1Xo`3hhx%0s0ifr;6ysvFjfxDy4|Kn_FHr7vkr{GF@7LS- zF`&5}*{cp}Qvv0&quV7WeS)CU1|&@*axyYU0K|G18k#an!^PzQ3Mc8S)b?66uKWJD z^Z;&*OiC(v-MEQD%a>N=ZNxNMw0q_BjZ^m6oA|E@tQ8JYgWXz%D3CjMer?q6^}a|S zoMA^vK|bhGctS#lJmDx5Ao{?RODvFp6?DKx(vZB8%lk4O-4VwHY%$_QL0MRlDE|rD z+wTwGtG~VyhFgh8>4@@sj;RUwG&Nhj(g1K>87`*T%^E0O2Av^gUJ`>sU%Ax?IV`N) z1VH^rrPV(VYvu~i*oeF=w@7tf)+W>gC0)n^tbY%vFv0pjkMd!9-+>hzw3pVCxA_9( zhgnZEHxx~bq3OnYB@`cq(- zuL-le=d}G%x8c4#vmxoHvr-~h1ZN6&DLeO!rlug`CYvg7V!_a%wjqa7<@=cXjiraI#Z#etCHxVZovx ztfub)<82c-?|9=gMI8`l!)lKb=d!TC`uCoJxm2hS2+pqqC>$hVv;7&> zvYW6#s5-W|sL1(fjjn^;He$hgMgKOl)$a1(+PK3c?%`o^`^{l<(D;2Aw3$y_Q$}8s z^y;j=M2eJ`=>5AtnZ}rku|D~#*Zhyzfl)KADMI)fTSQ= z83u35;f>N1@c5n)r_Rn&9sRF2I@!liKJ!KHr`VW2wr`95-)oW1KzKv;u%orrdHw%m z>@9$*+}ihHLO`XvK~zGzyF^mD1WA$Z?p6_LY3XhRk%kS3gaXptu<3Twv4L-KJnxD3 z^*8^|8D^Z>u%BnGb;osIcdS-4SL-t}N^Owr5yL^ybcliF+xL-i`4iedaSzPA=-GqC z&8|bS5Z*cbeK44f*=|fiPyK;30P7I9tx6xrX+-snNhrj!+-xTyER-h!j9msT?@D9E zo-ns5BakD#x%Hs2$!?dGql7E12eiO;WpBi0A$N1BPG61&zHkVKQ3azN7 z3qZiT+sgtapBQiv?tT4-atiZm`kA!9!&OY?rfH1TACwU@S*+kCV-sL((l;~9-@Ppu zA1?#HtKeKSXXe^jrTKITV~KWMDi5dF9H@wkWWvlPL%e{g`0hu0ZJEoL*p&if=NT%00BXknvaC@eU0q<vARU%BjAUBtQfFRDXvks^LFqGg<3bbX9vu{GP+!vti5kUq0UpQq9Tr(Uq zbf~D?-saxa`;68seVC?SyN^^tb8Hg2feIW7n7(W0{`@|Q(%{xA0giq$M-@3j=D$Kc z^ll{g%Xr*L)j%aO8xq!0YLFkY zZqNl669dHu6+-JvDye956~@6)eN84Edo9_YNXL!JZJBiGiC&Dh@A z7|m=r_au4mSOma6ttjNZNNk87B7%6`UlM%NaiBs3!eL&2KQPxjma){|RJQF@2Q2p1 zB3E+1>$A)4vYFE7T%XnL`DE<5GAhM~D#0wETwQQ0%-PvlvUvi31&dk1fiI3?H`g^R z%71yBQ({@c{F(WwKBRXpSFTfjT39#!i;HW8nzZ#R5O@IijYb1mz4vZI_TZ+t}W+em44o} zXRCgniajDy=qyVlWq;}IIZ&EvM$qF@5dy=?Ednj$U{c~T2IC!FFux$kIl&8O=UKm% 
zVB`mZ$*RgS%PDnHTp}^iWd28=B?Uh##rQIRZ|`ih8f6@Q^HHoIC#jc|?t(q4d=>Am ze?HfWmLqsKQ+mEjh0_(Mu|yCwYPB)^>{gDV_J+q_CN$-gnG@E>_ZF$!$` zugkI9V^c>k>j*yxa?x}q@Apg|^5^|9_;x0D+^>e6Jv+qf>A@J5$7qlqS6iC)}{xHmoA437r#r?bgKJsrF-1ZOYJv!cKimsZQJ>4n$@qyLm!2(V5 z{sWgU&Au0+z_^vV` zE&vn)h)6kXf!YpuvA>YeP4^3uJOfWJc}4m?Wo@a3={!?>YN3k{U2=4+tda16>9pwF zu&Wct`P@W$TH1wMBPx#}y4;`r4JO@6zk6+&@AsMD9!riZ(n9$x;lV3gS<(nV4zHjusO|dB{%kC9} zV4N&(_jEIMwkNEgA6rn}bcsjgk*J}U`c(c2VJE-qrpA(Pk2Ut%6L^&*(af6k)_r%rA-#GM<-LD zdM)|6q3Q6WWbVq1%=X9aXP-Itak<+R^uz5gX)2;>dESR3hx-Cs=pp~MN5B1yE6z}g zQ<sagt6H2HjG}Jp+UQoqxD9-nn;#c$qkJLy+jy1<=MY+z)mur`yWxTqa-8 zV&LA~uuC=_9t6~PJg+@2t%c`8N+VN4jvRr4j%hRtRcdY$tquym1{gD4;oBOI_exO# z|0PqD#mXTM z{t$Zz3^nah(FLezp;!2D?}UcP4`B_ZTv-L`@mkGAqd`#wMimX~kvciL3qA?3XgmR0CJxy^;*34lPS|NIgqm7qvu$ z;dsMk{jEmPTMIgWA+H&Q*0V78$cCi2>Fd5zol#uaYfhtIX62>=U-%W@%QOl}Bn`R3 zVvF?d?S;pX3%~1+aQZVN1+|?i5r56jHmVfYObz&gU|ZzNo~V!hJub+ zyZ8}Sv;Mj`Y-|F)kKA{<(1ZI8MV&*gO2?|Cw~<}9&t=;>#fT}h)yoox_=`Oi8+)?< zZj`iJEqGPAV{mFXceS{4U8aT3N2v151#CjR9ETK_rNZjrASjWxe^)#N%3KNYGb$SS zX%@Fn&gh9h+(L#$jj`n%t}0B*#f(AwPbNGJYPQN4?j%{|Tw~Q~2{#>&6nrZq&`+Rd zHmvtqy7Mt@R#DJ1={Z#5g3(LPSQ_@Y`Uhnbg{wRp(`Sz^Z&P;9HAA!B z_MH!u2)(|DK=}T*;3pl4nH4~eaP14CCC^j;oW59$hy?b5o zQ4r>LTu+U@9L_4HR?=hZnql?Ot>`i^zFUH|fn{mNu|^5;LJxQMaS8lO2u zT$PgOZ#*3|&FV!-QM!h}3+?!s+)qY5GvU%M?+b;j$NcJ=Ybh1vvu8I|b{;5R9(>MO zJ0WxwGg^8ajuP9`Kkb@f;UaL2XPa2Av#8b2Ez(l(zQ(fO3ptsjk>c8UFf029fZ#Yq{+3Ae2R;J^mm`xDl;9K5B#A~rPg+zmxGQ8tCAKf36W3cmD1*UN7737?1SeEJ2LjN`is}6h36vXvUg)6)^m=>W&Iv#y!1R8OqxnS66AZ%XiU75?bOBdR2T_V2hLnVRR@R1Tii9f9A9^M;gg|VQ$=4F&L~P4E8!lpO zQMm<^;XezaunyR8kzsG;H5xm2S>u@_a?$F-^ATOy90Hh~? 
zp96JZ_lceKISqPEho4!d|0qsvT!QBcrx7v*J77d7Kd@i)DP8wr+IQ-FUOFKkq#okC zKAR)d*)iizPJg-IBC5H!K?lnJ5+p?6uPv@rnoP~(r;=kqb*AF{cd$rLpvd?km zQb&@i6aKXDF=L5mc!}AC_7rb#vy(uN9YilYU^~(_!6k7{WOdGiQA}iiI5?P%LXh5H z(nk&cPRU*BY$NWB|Bz+)^|v*O%yMmt_6&P^^X|S9sl<(YRk|}Rqj|*p_OuOLoQhZI zkym@7jKZLT8G9^`a^2Q`}ZILs_e+sQps z&^{bg2s*|q2VW2lIXkegqa%-S9?Lh}yrqX} zCPDslu{^=Bz_TmO>SD}dRaMj3_^l8)OHa)9T5X$H`S->oH}{iE%)3jNg@(04j>@Uj zsG!UcEcWM2sq!=1d0Y2K0Z$Jb3RbQoyoPqw5W+y_4KzClGn2OLBN&1JQV?`PuC7~_ z)K#+fAUC3QrBv{h!-7_r64ww*N(CFVNG}M<%`1jXUq-ZrD28q3JcQGue^i=qdDmB~ zk!z)Bi>ov0T~+EP!j6qk zXixcp+Ew>yF8s``u6W=S}ZU`cZZ!T&DDwY}oQdw?womS8*Z385x<0TGl#G zTz($o!vxShri!iHHTaZ{eiXn=7%d#y<0 z**E|5AgFE4L)fk8n+pqT$Uz%2Xlv6O!sFc)J#0Vhs8X$V${UPwt7+udS>`6)ofF8{ zT4Ri7oYYSsm;6qDFxOx!+;Dzfg_YMSqJ#svih3_*Upsk*b8+>&j&1VgnPM}q*sn}K{Sfu^h2BQ- zNX+^sJ5JrhK5T3-tiVxD-oxh{s@RAL6Eu?Q9c@xDlt(-1zl`P?a!>8vS@q|xcf#aF#u0ASR$_YB z%h#qoPmJW14A0M&a)kS0waSb13X0dsnIq*y)emi{T2wpMpK6k8jf1JNEO(aLN@agI zjb8%x=uDsSx>#q2>S!6mYB%wkdBP`Ixxi|e%r(QNy;2&s@t(_2#5)LEX*yBa5PBz7 zvEU>0YFaCd^&-8bi(pJ?xBy9!c$iopa?PB<`5=7UrT&4NQuHSAf$9P7sd163A!KmJ z*GDMdTJ%i)#jMVAjhX}Ru=ai;jr~oQoCr_XFaOw`y1l49vul^7;336wEPF>Gy2A@% zb_4a}G=AUfT!E@+6}DaKaxr^%NAaICTz+ObryoF_NCecHQbhfWesXjJ+FT~tevUcg zo^*QnE@+13?iV%(#5D9fMY+0B7?Uf+V&dt`snO>&@a+hU`#6d1XN!ox>|V>&UadbX z1eUXeZGY|Fq|+My#*q;`+K-WcdN=4gFQS2i#}7VF436eD6^CX#F5U-F;ZkF%?f%sm zdqQ6U=h3@JggJ3EMA`+HY`fOnpGz3CqwwATdWw5wvVp@@=R(uvUSxUDk?g}4kbD)} zEWsz_)x}z~*L@z1VaTrvKImFVSPa1d&(LG8pRt%AuTsrZ z>o7jJvH%;HbQ5d{iCk(#{0B;-+=`Y0MfJNsY;ui!J_7IC%Gp=!U^5m7JmDm5YH-0$ z*`(;$vE8U?u$K2-i!x>$fqxvG1AP0zW8|fy&Z_?A{A;EJHnEzMOSeRnxI6Yp$li$g zr|ZN4REQvwb@=mbf7RxHfqCG60NpM}-)HlbN$=`?2X*AmTs50ixlg(0#0I5pBn+;Hp0KC5W5$kR`Vw54iCBdk*QJm z^{37=48Qw`2!;PKV8{$fV$~a`mF`LX zTbU2<>d(A@fX~F+!4MkCUv~c!%%COR(*1D&XB<;vVUZ<-#r$D*G4gf2=BTQ9KnsJr zXVu{hu2R64iIa;IaQ_bg(Tw0lBuMO3>+6FK*2=eM>h;IVD7QuRUm$h|xqxzX3h=|w ztxv@w;{FEz9{USOGX7N6lw+GpkuDg5s=md$PyaI43q{>OHE7*>3(0 zCi!ZWy}$VfF@?lTJ=ua(m=fsRhqjZZ@HW|35&JRHWIe7l!^EZ%mNo)iZt65mG!isR 
zM~+EpLMOwEZAV0piMt?>Be%qZ=~vxVBzMS?AI z{4S^4`?EWB2}Fj9HHW9FjoZ;TaA!-LoRr$zM>2gamX^DQ0VSF==XAQ@QVIXrrgato zzx}AS^>xPN<%S^sibTU_5;iTdDV+VJ`05t3-c2VdaHX2CmXfNaWk#iWm}~sZp@omQ zmw=cQTb2x+a=N1Gc&b2=?bJ+GtifG;e>iNxx;$d!Ys{{LKr3l^9Y*Ngds=uE$o`d8 z`r00m#rRqXXJqR;7E>Pu+mvDQ+SW4R9xuNH1V<1mgVt9w#k6zcV*R$amt(uw6Te-y zwntdDw$Ji+kZcJ%?H?Jr=8WjEti8Ham%0j-sQpHr@jlHr)sFE{tmWwpXaXxBL@{qb z?gqIQR$#Q#n$)oH&&;PNg+#yZea;fEf6r>MWXiXYEO+g{-^E?53O0#>b(e00~F>G0W^{R42kXxBdQ3>ko7{(Z?u_kD!~y9kI2Q871p?dSMdS(A$+>IS+D3pWP&`;|3=((|@hG-JZU-SP3x z-_5#=KS>%bsXg30^7ayPvBbjtojN?5%|6YK`7q(NGnG_r{mp^7ua@?VC_Y737S}*c zjeNI0F4p-zhvPjIAKh9%@$#5GJ_D&wIN}UjW>Z z^`36U!xv-a5`Lgqg-zWKPN@Qn(fF)9R9&WCV7~FV7y+V4!{5<+c-!~lFv5PmS*6jH zC2!BMI}X?Tm|}C*n^}MT^hN-Dq(I8o0OIjfb@1%j$7B^|Rwsx?D`pJg$Rc5WZtSW0RdY{J%*>|3&>$Ea2G+?0E zL+ZjoJD=3*L9kw`|4%n=J0z#JL7&K__^49Pmv!#*X}7)a;0BVF`=Bg2=-nEC2IFyR zsu@PZ!A$h!g}Ld$x~ugVjVrywQ*&aMySE?xU#Az%W!{X}T00?Q@(>8G%cZaDWQ&!6| zsBWmHHYcguwy3kjbKiAq%r@_d>cfJP>o1H*JO&kFqZz~o9!y-$wG`Ery+b@IN)?-B zHclQpI~Msg9z~9xN_`=3-iUSvP-%xbCCbS6Q!d`R4Q@y@4*0`bzpAcf@I(mTC6eNc zSzh)oP`V6*b>?_V3lCN1NQ=-)_=wr7cIxK1F~u+Hl86rgLyKYyfhcO8bkSe?{#+2< zhdjm><)0yO+F<-x7@YjgLote|3dT7%h4Sgm0?3JCe91>ohLp+K+0pnuH~BclCbPNO z{8#{T>{bOn-gSI&vbJdw*C?#xwR@Xh`~{dJ_h$@M+90)Y-lOV4Onr@W@*w63$E=;G z*W{UJ*h{|mOTrw_Fs<$1+y*A>Ubt4-mxcmucVLSZlDR(^vqj?%XinYebf2ga4BysU zNGG#XUb*XO+ptBe#}i2~G{Z315y~*8UyPBI6o!Y7W~JtjI2*6H+rP16@tB0|q)7S| zAI9&@u(oR$RPKR=F%0@>aBy_q(|mF$kF-*i5x)RtEKS^oXQ@#ofr$ zM-qC4#Ai!*GR#ADjBgQFSfdisg}&aw0hLzYK&{dZI#{q%oq4Qu!g#lyPATU{bftfyG%RJezsd{eS!_WhU29NXq^_rEMAgorolqL z^k^@xJj?eSn;;Q7w)~+-c>k4dn#EA=a{^lHbZgvi(1sA_Q**2uy)rS#htSm`5=+Y* zDKXFCoQ<+Vq%oc1!2JN{xAa%71ieP$^*uRxYmp8}w@|~!IY7l+CoKz2oo<}{ditVJ z8Ru-^u4eOiBh%3*mp8zK-_J$oxlu-Pwpd_e>GI!-9X&O{^U*N7j#_Q}3Z}XvFLg^O z;&k>5xNB*9^i5~iao%qQqhC(_tJK(H?v4CTZr(tNP*NkDX!J=V#{CtDYY{f_s$l2X zt5nV;zq228Qi0;Ya=nJV0&O#|#8rO-O%`+s5CXFYIuv<9*FF*$cySkis#`#_!{H|HZ#X^sUQXQZ^}leEPnLstw`p#L&s{!&@f zOHs<-`$W^<4+LDTX~E>Zjv;&L-a0o>Zx+#%p)W<$m!=7)w(wJyg)Fh8rvFCXbB)V7 
zv!Uw@AI=D9ZX{(!*N>Sd<8Lw)Ts+ptBtrCBQ+}5*=y&^^CcqSoCZjeAT%Dx<0c8@6 zl)|S;QFt$T#^2+;f$f{Bzy++w!gAx_qy0&-f;JQO8;_@qj<;7dg2=cFry-v6b#)fS zyif^xT>#SR&=E8Vy%Nf5HRL!s=M-hvAGr1O89cJ)(l-`zA<67nS6f$R%Q`ul4I#J{ znw=;U*7sj7RI`?$x#%HX&FVHjJ;iCw!})9=5|6&hKecKTNo3f3sz4!&I6wB0dZ0v2 zO)ZhNcflc|{B^qcPtU2$+4WU(9i>~BM$&`tSqF+)bwjr?Ad0wzno}!1E!keIujsE# zPHAUuz4`1VL^h4ilUh4Z)o88Nhp#N0D<@!J|HP9tfhcsDeUn)dK(m>(RGt{FpNID} z5$5t`lbI8#vg|XOVLQ{rR<*2#8sNHc(hBqbthuIdLqU<{fpd|aS|bw^>m6YQ1cYh! zqs5?9*FBViO_N>!Dl~&L-?<}-$Ibsoq{LuPj?{k{(AHvI0&^D0a)JA-WkQ#1PKGXF zn=|#ZnUDRmNw&TO63(E7wE5zO!i+jto+>$<)nP9Z5tTbf$f|At6)!Xj=A`n@vP9x( zW+&5e!TG^gbbC{?-Gm&sn&%Cup(OfTdd((|M>l?&Wz@s9r5I;Mj9N?S_lk5(L>omt zJe!S&uS-uq4=RZb!}+LsxuDVMlsub{aG^^?gcS^ALnuc71-OI{KC&(kY3l5DF^7*I zT7Z#VTdb>7m^>f`TO|!pB(8E?LJi$A0+36;=sSD0VdTiDesdjtr46Jr2gM4KFJQ!| z>;r$Yj%P3MJF5nQ5B@shXo_IjG+3NWGyd=!ABtXXli(9Q1w8ljjQe z{n>6&g@S%1ISBC>4P(Q)kw4f{0U!~W<`WbEY>o6Jvr{#a0h(2pR8oi+Ng6Am>D8Ou z!e^xsbr>N0W>o4>F5#lDqf@$E=jd66M2RvGX18v44L0C(lK)7{lQB?fE_e5Ysp$?OkJn{;7V>922i7LK1U zU*Z_@K$KHN$YmX~Ma1Xyp=b36((k)bld23}o>l9A&?o;0^WDRskHZ_ZRKN2$LKOx- zz9SwQunT(GuyRXML#AK!WTs?hGRolSd@F~44d*Yswd|9OP+Q2J1EhCmQ}Q7yL;@8> zK_gE&U6@7LC~vmd52B4r&L;AmZSjsrkSa+OU4rT;Y56*$b?MI$*7i_JB#kQ^T=18% zbiLhuqGg_551g6HQut;1%AGD3J2beJJ7nd_S8J2E7g7OQjTm|<`@A!@EpI{pD8#JjW4VfigASn6pf$VBT zBwFsLYSN_`86#u|MAR6*?mS(y>}zSFCh28rPAQPzcCjMVMu}5e|#Mk@wkY< z+1%k_ky{4Sr?&tVJ$-5=-N|?3W<|ALVE*oHe8Q0Ksl+wb(U2hr;-M~~U?F#g9V5YS zxVv8gEtn10nl;mbsY!`|Lv<7OAeB|z;|FuZ|4fsesuBSoBZnbr3>vL&B4Rv;2$WVp zNg@16Nxg=xt`hZSw0s5;@P25HxPBb30lW?wj@Fd?IP;fGIYyua1{(DKAJp1;da?Pz zD&@)ib*eV!oKu|!1Ty!5;}LNFv>HOOewN{bdGCrk&x3RaJvK)AN?@a+Hyv3vql6fbTdfBo5P%k^;oJvG=(bf3aYapX?w~gAV`_%@Ydspb^RT*8C-3RoJ$v!k!Igy$r*r z=}#rtl6bbHruw?4=Cj%Y0IaAGR*(hjRsZf$ps3652n|7WtNbKdd?ns-4WsR+~1z8 zOawS*D!-Rf@S`bdJ|>Ah$?ZOm4s8EFQJ^99UnuZuw10ZAO4s#m1~le~N6w)N@i5iAV+J!gRv6BcUdzfho>)uum;P3RXnH~PWw z7Y@{WxI3i42By#k-V0CkKgIV~J4}s<`C)hoxJ%}wsy0B~7Tx8sP)RHcc0oMUTerKa zsx|4-@Cu2_ft6EfXs$nRU{s^+0!B(|z# 
z{$caJVXyy_$G-8MZ(|z~Iyzhnu7mCj{D-LuIRrOV62ilngYE8rU240#6yN4+FNTcH zkiIpt@%(Jzp5%?A*BLdHU!1?_;*b^aNMT$9TgYs9H42VM>SucfM#~%jLtNJhKtbIB zVAJnnr=rnqMHtrpBsp%0hq8Msr=ROnf_OyY1%|W1yK>oIC=e(04-{y4$h68tuKD(n zy7cy>bD{fQ1;NSz&%pHHX($=M)gG|8?Cb-eK$i1YO3_Pscg>Pppz_=$ND^lQzij*e zuJiN{imT?)F)oaB*4X7a+g$L=lk+2cc}3sEUFFZh-#L=(mk2k|!yE1(gIFihNB}hSRJ! zWr4ZPP|m`}&t0uewKZS_v!1Rg3qzUK3@nhMg}7TTczV znSJj6+^@Rfd9PTOVgT+ild{e;q+K+7`i>dn2|QqbnIUF3J_V zbhOn;+?MR$3|?jt>*v{yZC7k>F9GZ=1$Gg2DZ&I``Sj5r{Eh^{gBqlqT*NR!cTV*U(GVaWkl+iM*T-^g} z$my1&`nq}l{9%vpS|Qf|;=rT$zi?n@yx9wg(F1@b8MytAMx!ywMkcG7g3DB0XaT== zg`&deOsH&`-^dH%?4p@a$FG1O>8fjwhCv+CFmQjt1$Y3~Kj`g0Ad(C9zR2Y34J^o4 zljm6IUta@*=um>+o8FYF^*u8n#N;N8vrq7B>f=HPG1JSXJhU@fVZ`#ns?MAcQw`U1 zMz66AKd8|kZ#Ei2r8-#k=3>&l@l)*MS!(?qjW^UnB%pJ+QUAbe*7OhX z^h6ooUdud5tDyXy#QnizfdhD=DPyi0j0@N=3@5&I7ySG# zHa!1Z@DpQ`lm3Bn?JqN=!_0G)&Q8&a1JXQ2X`oFyMSP!fk>TR=dK@-y^J> zrw2O`^J%yuho=UNcNSu%-oyZ^jS~?l9(Y6O{U*e@y?(>0RJ>?4bj2P6jPHqtNDALx z-SyR4dkidr%XaGP(N^tn1UpSuD==S+(j->k3kf(4ihuMZ`j6(w&=1pUYn9J&8cTcW zOj*pME4})hlK>k!s(<`iaE>*g^sY})slJfeK;|`=rCQVY9ZT{zK|NqaM`~;vLBay? 
zHc`|kO%(F7_NtH?nU^oELl^5bCMTH`MG&=vvT4#JI^|Wz+e5f`ZN276>{S`YAfATB zYx(_1)~eqzKhlm%Sbhaz#UmQjqslgH4R|Ymy2Im{Y;L;AT(#O0zgi=`D~$ygVQ8<$u1|h93}g zV~Mmz>(2+@7HT|yeqlXU%BnR2kdyNYL43lofOn0yR+nN_Ct7aef%+kym&yL}WTiQ{ zrHH;-F-6EFK8Z=dHZwgq5SUgyR4J7AOd%72EuvGb%)&CGAxr^2n1^oMJI~S<3r=YP zL>2To9l?E&PQt!mWZXe6un%hb{jAYWn%%UMKyc9@V&J zvMp{BnjH~ukjj50G=*uh5HI`7p^1mvp7SCnRX51G0om&TEv@Uu5C)uq{O06e6>btB zs2(l-naun@QPU8JVf9oT5_Hf+2>)OF*)QDmn1Kit_@HKtP%-mfuV^HTtIf3^a+JK= zwNotDxr`&3k7ofq?(C3Kuf`=7oRkM!TEYL{{MoQR@}uqoa}j$Oe1FAo@BzUkXnlJk z$b_2k3pM?3md~CsB;+H=u*(G!{K=u!DVzO~>XgjVehC%7ds<_68DKy=Z+t5FU6sss zz|Q61;F*$FBOw#7O7!vz+&8yDWGP)J=z*-z!K#ENVzr`+8ym}{u0qH>yXprIjLUOL~qh;g|V2F@uQ0R)G;8@ ze1l-pw?Q&?B4e{3oeh$gLK$kCP!@P!dK}{~75TBLZdpl zm1WmA-t)bXzfGn)hx~AYGot3zoe2wRl0cI6Xc=_r;>gDo@6cjz=i=s!j+dh!bzRk5 ztuvZVPEys^zzBSZVt%6sjIt+jo~56h+xwW&yF2d#b(9HFD4&(33M7ZrJ#?JR3o(Ua z>inqZjTk{Ct!`pVQHcjHRTndpfwyHtXkvWLOLi5As8ON!;rd*(5`svk2Qo52dFz{8 zIuF8Z{(9mhiahtF&fXPANhij9ihLgk{HrvUYFgS_6tz9^+CH_^yM6K}>VholiJ01F zp(}QNes-oSPJKZDhr^TrVnDC}&Wmo{IqV6m${iQ{CC;t~+F z=}RR&gpIdc5i@fpAz&zO%=GF>ja9qZawHwLzLud^sPkLt_Ld9&w2V$2CMK*XL&c+0pOcoT5jo@Ese&HHvChXy{HR z8iD8tl|bO69HoHU`}9_w!&Cq6y)HtDnnQ0*fgm zKt{Tgf&(pwCl%gxY5DScs=DPKN_Hw1ylo(^Z+27C)wLH;e5Mj6L$k9PJ?h%e+q&@C zp0$T2y6ry#$Z(+Kc&MGTIP@6j!HXSK`^PXk9QP7@&1YkuWnNwu{m?bT6np%v-sUE4 z%40$@T>~z>S0gzR+L?f<=kEKx_>COZp{rJ8`G_26 z6RTopY7@Guo_d7ll{Yk8JviR@!URr1FU8Az?X0S*1p54&)ZxQ+_SI8)@(L=%7RM|P zQZKj5%S7n&E_D}&4)$>H!JrwGPiz6}$!w0yNu&$XI${SoNu~!T-9w(%C>)em5m}G) zkrXJo(r#glhw}Nczt;HjI~l^s4q$zo@1g<`CuT6n49nUb%<+~HL_7?DVq&O3Q1N4u z18`7WIJooV1B%Q|ezd}CAKym|t*xue&Rc1ln``T8)>^=P+aMFQOSy4t06CP8T^vTH zAZvvb%z)s+(O2I(f`|~(%73zc=V!$iN)-qq!Y|nc&lL)Hwqpm7@g;koh!>I%vjaGJ zgg;Ln;|Wp&iJ%R3K9-BzmqIff_Fv-R!#seTll&jWe0Wdt=us&)!+t*c6rf%TEfp**x7Y@|yF%9@(ia$C2;6KB+LP~p5FsJ)&7?FlWzu*v_>>mo#AM@y2^l?XWMJos%eLN zEw=yh&_>^ju$2{|ND5C@H?hvni0aYNek~1Q_;PGsSM&0Vm+zqwJ7%7kxwrU@ek*6O zh)UM7$A^dRE~z;Yu%&X`S@mNaDQ_nyy6z^mDj9*2$^##rDNPp)TNii#^qBo#%i(7H 
zr5MTM>stSmw2Ta)mun;rKz`DM)XfiaFlj`mr$puCNFR`IE-Vn+q<*#7_|V$QM))8~ zhm2UZ;6Tge?v5ukq@kfs!K2(a41zXYF?R?3rHZC~PeiCfTue<(hpv5yGC2LOJ`NYG zsVu18t!gtJf>l?O@9!t<%p1N=^!dX;1Qj{YwI#@1x14Xfb;s82T#pnb3V?2#=7L)H z%-R!na$8l^#g4Go8ba$Ohle94bJMhHC53s8n)j#8gy=gst;c#ze)!y_bUWTAKSs~d z;Q}7$X4Zr&qY(@$lxYJ05(G8nV-SszVPs`pmT9rp>gbL?(RpG#xV4$w^@fwHJVp3S zhLp6(g!e1pcePkLsbp9pf)U;q4b(2%cggDLr1y3==u!}cKpw5Fo3P{tZ?H+g=n#IV z`Xx9Wk7?QLyBqgJosUK>XBNL0$ziF2?)$jKm9@3~?|FTF2L!4Ni(m#kieL#)jHPeK-Qz^yQTXee5*fE1J!^)Lq6Z0DyW#B_oN@!Cf@_j7FR zL?zu=W9Y9;PYKG9Bt8qqcWwqT_6;kSTifsHRaunL9>Ju2;}aQiHO19tP1X)MIE>+6 zL&~5lk?#9C5fP(@AI$oE-nqpf{^ZSD$aS*B7V-6h(knqMFfY_N-xe2-Qc|uFvFgc0 zM*3!|TW+fk537@t6{DgyfA6YyoT}P;Cu+dS?tQKKweuCy|FI8EMEmk=q@>R4{(~n{ zG{u=OtmL}#=jISYX@yZkhJ6PMT025=y9h`s#@+?`=Qu zF#0ZI*j&MNfrl?J=4V+_wY@iK*DoX7hrahsYyV4cxm^0lL& z(ixwa%b<$H)A-!-b-0#rO6rVId?&X4MsBF>olmiE{ttlR(I12!+`h#PB^i7$IL?;H-iuyGgB{Vwk<413)C?@iB zo9u*Miy$7ICn6Q9@?lmL96z+**B7qN9XAB_S1=} z6bxmHx-iVCzWR;fU$A`_Gf%B9p9!xP4)o5ATmei} zZAc{%SN@P$QsnRlli2CpJb}M|vP|N~j~<>W3hqD!64+XF)A=lpw>6LBLRco z8FC?HYmqA5eiH+`^I!OER%UF)ev>ghLWZ*^bliZ9V9x?Ilz8LTJ8FixF9;ph>@-Dp z41ySxcBSaPwZ8s_+troe^kE;ij~-U5dJOG&41;Y?ib>zszpdIF^9hINvY;Oh``U3W zxOgtJx?)(rJId)g84zEty}P=$_Usw!+Dw??_bMCdlr&%L=S$!qET(S|n7kn<_>gE; z!c$e@6wNB}s27n);_oI)Xf(k9TYj*g@O;Pvv7@qUNg(a%Yba%}IP88J0u1?YHw<(U zq+q3=Jkhk;{E#MOXlZ3t5!H%spEEjcpUrT;Z|YN4E|5yCar2`idq?e1vTSKR3hcJ` zc;c@J+3dl4x&nHPbaAS>!(<2zn<|Zc@jRYskCyW5kW{!_!neM>f`Z+7!`N7^iu_et z&k8+om^#eZ{w$1)%uJ2*&F#?`apC3vPiQ7r`Uf<_*8dMQ+kR;DKTFYn2OZast7KSV zWb>ECV1fp5mo923JXaDoaXs7#B~0=BM4eV`Do@I`5FO1S^P|;5t|Th-s;bc89Ujv+ z1TooAsA-wxNrP@$VOd#{*7t=D#)d|*#DY(s-^SAO3Os{U#VX&VS`DfbL8?_VhLHow zP4e{`T$$M$8PWj&Tzq_Wv}_^|H(8RiqbZcaO&vZm;RiT6>+kF}wz>~yv_^&qq8Jyh zj6RE>1v%e9(%&p06mw;Z!z(Wp3!pEdvT$u?SqwQ1NVw4K*3^iQP=7(5K4H{ONctO+ zP6~fZ_S4)Y2^i<$_W1?dq*lV`2ZD7kNu=RM$ zy~E%~e<2a;<-w=rAP?X!cUd1j=NUdT71@!i5d<^ThfJ;_dkg4 z8o2I3EMui;LZQ_?v-4GH&+E5%h8o}?U7r?9dFa2AEnrPNLzno(JP&?kIB1xKWQna$ zc$t$%tx--CT=BF3xtR}Uz1Ta%_a*H|jvkCttIV4Nqsg(*@ 
zSC1#$E61u$*b^yC?O16!jOvh}kEMn;ZhEOzqA^hu1v`>GJ$L)I9|TdSpzEWB!M}*{ zNRNJjMRgy0D;QhU%A3r2~Lv#2y!^42-u!9)S4YYO|E(0E=t1^w&6#CvD8enpW1y}Ecl(yQz3>72D?3Z zB2M(|)+5se3O+{#VhKHqSj!pAb+${{0@)N&^7z*2)ANUqQ~1MO>#{xMf~il}mm2M^ z%AAUn#9`;5LbD}=H}GKA}lW(qinV@CSWsl?d=T)vswf!)#5DSMG`wRXHOpvxdJQ(M%?|((LwBF?vU?e z=Q_}#ryZE|Owaw=6-#NNZz4cTE5^*qs≫hVy`cZK-}&N)XY7G%Ymo`E&kCE4lr& z<$kicS?BdPTaPmE-ja6++0Kgnf7P93Sd?4%?h!7uf6rQso^NoN zKi6b?SJW-CfSg9E_VF?fXMo^5j?O+|hd<(ZcQ}o` z-QIEto0i5?xFSk*^;d%cK`&@+k)1KnI_}o~7^7%mB0 zPy%D-Ehi+lV7E1VW^sP1!3Q6JL$Z$^-S_fh({6-7{8Jy7aDR&>SHMw&77yFp8Kf_@ zEV^#p){!XT5X)-$eO?Mk?+DNFn~m3C9d55YfX!kcT?$qP{jHSXB)h|5%DU(j0?U<` z5@4?I;B&vk?B}6tVb#7edidIItVi2BX3u77v~^CoMg>}LCA40@d8DOKDTN=FS@h+D zrV13ZK^w+gx&D#oXuqqzG!+c4(Qd?dC+mq8BkJm$MJmYVibF89>>A6*!4E6Y!F&z$1(m)XD7=8iXh?My}jy6P5ahen%C+jp?SfL(+{-3@<0!A_L5{H1NqrmSceJ?Zq2pN6eTkUZ z5XyhM+_z3nS=n=?*5GUqxyTb~^&y>>|Kc;w6Hxmoe1~m}sGZ?z~ z!p&@#&P7D+&$=$<>Qt>64#;BkeddY8w76@?=e`!k#iiC}^DHA$^Il(1|6KSfHB-mk zwPL>EGL5;T{6V;zi5H{c!RmQq+Phw<(9iv}Mie!}P&yky29=VX>B1TtnBwx8w7``0|xA z93fivgyUi0$vXth&&}oFu#1Vs%CM<^iH@xZCa5!OF??AH31Uu2(Bn5S6XTCMFDggo zM2ar9=RDv~6$q+!GfWmtb*S~$V%Nc@vmvc{bv9cbk?xIEUJDKLGBC5Wf~~om>Ty!v zYl`320%gdPwtJ~wj|@CKX40h`Szm{Jozpb*lft%eC~i4}`V|x<6pkOcB0R>d=r5z# zsbeWI#cqD;&7t>#HwArp!%l1VDpzl~+y24$d*{j7ehg5?QSgX}l&7WLHy#U0N+Qcq z+zXwZphZ*PE!t-Ga4o!-5BAX_X9!k-rdSV;A3H@H@(3=(g5j6yb|qb%@Q8uKW;<_> z>d3FpX|HNK4Qg1dmKq>>bYVPuVEKG1TuvBRmyPe$)zKn-^%-+}nqGb06e+v6yiCt` z2Jdh-kI_BbqXG9r@UYj-lX-uymF4UIT3$Xe^+9vt*M*W&il0?4P2;P@r`Qocy2}LxXJGd3G1(x$A z=q0$=HE^8Z%xT2Ll2h>Qz*!<)Z-;i+0lrdJpdq@deSRgz*S0D8y|1`*)e!AX(lD!)RH2jK(}yJnJi zT&yw{M+%?d4z6K}XrLy;Qc{XDT99Zl3m)dMt~y*CwUui$r!B_8r~|@xt*I zOb1Ixcgnfu=Oy9vFT5FRa4z|`Rgi?ir?6G-dc9T7CZ>_D0uxO^2e7}7t5ZqqSNzf& zw>Q4yGE4~XkjlZD!|Jos;kt68Q^@MHDK)7x4sne(YPNDUEHlUH0sEcCKA1Rvi8yCT zS-Er3ti46~>y08NY38VNwnEXDG((y)+yoe#$}~^PQ**H}5v|^VM|vLzbUP#Pi3F-G z)@@22Iy156R$f&_FcmO;d6AdNmF8AUJGb0D$hge2f^;0gQj(YDY?1T`LpP)C>I;dC zHpd%`kgc~6K|yiRpbkpamx14sC*SXI?C2Ykwif!xEDn={<0s|xoE+EN#mg*#QnU+E 
zG$~f1jQ$o!oW-Pv-PFmjgi$urAVWDghz}?K{Gq;t0Yb)FHZs8zq?+ee+fyA}K_%zq zt$?Wt>@6&8yVr$YBNT4=X?zrOgC!+L8NQ;>SC~ zp!Exo$Po^V(jCnz2aB>L18_aaN9%=8B=UC z|BNa4?x%7XbXD3ra?*@AMl4hOstUjpSy;|SsL3f9(l`}uF?CP^tQIw?rebN&Im#f5 z=eHH~YC{fJ@|j~hRHp^#jBZl+W+ox!4_9WaOV5#ZrrQpd&U3xkF0Jo*jRkh8`*Hrd zr5gt8zNk_<1JO+EA3Wyr#Dcsg+?oSgUc0@1HQ*|?zclA)8n?YgH*8#=+rWWETm(*! zLNH0%JFs&JDCM}3hbKW8{TV6Q>)%vL^eqn;wXF!63dWp$u@Uug$-+9W+NPmho5gi~ z-I19&)^QoeZm!3h(fP5Co?x!};rInGNV4K_~|V2USg&A$`5*Gsy*2GAX(OJN4QN%r56SWG-2b!{H*u$F4V)z{iMIwdF3BuI1UTPUs}3sZ$QCd=zEzK z^3@JR0R1Zc0m-+eM4p&x_DPLTuxPPxraqx$oum=Ta`gDdrYPdoOCW;D87y$vBb_hP zUSM)`r=SV&7v1&SgG z`xM3Yn}30y-Hb{4Ge|LuVldQQSknfzdJa%Kdlp$tbiy%~TrPTjG0|^cytZdD+~wc7 zGa>rQkd>0qnbQCrd~zQ+`ZcE0@}=9=@ZlrMT4KSL=-7AQKQ$2q^BW=8Rt zrSY$OBdZNnGICFxDXpvp*2{__)JKxqT1oAFQ0o24jnAp>-@bx6X)^Y1blOLktoESU zh-Y3k>Cg7;+{1MGv{)#at;hI@*~3gON$EB^icW}(Tq9(HVLxGO7T}8$M7hREIzu6H zM)Xo8^(thG&(_kVoP2`VGTOpOM*k*nuG&=i2dBpq)wC>+Wc2%Z54YSBy|?^5S1Q8y zr{@x6UyVAiyGG}gpq+dHO%41r=hQJ!XwFRxWzaXYpt_zlMEIQn_YqT8v4MjIcD<7qZ*j*x0fN?FqYxop}2P$cF zP)VDf5=!2C{;Ih-W_PzDD%q?hyD24i=3q;C*lQ1~*c?jvDYYM@%C|EY=JR^#6st`R zb}bP32AD%8_v9>Jz_aoTt9Cv?-EU+hAbij$7yG+g7i43i_TN-ia(w$rrt?7!I++Rr z;gH<3px6@o-Zw%>uT<)vpQ{&AF75gL`09VSOy^EM;zN@3P)fU%1+qA?Bdrli4}nYd zw}=crN6-?_&Ar~)sd{Xys~ZnOg1CN4E=kgap39kT6?Wdm(8n#|)Lq{`V|zKjO9b8=%GI~Z02gP!fi`QISdUgvV@R{ z9bV~XxsR_}Mo({!P;A=v|9pMNIPNvTQF3 z(?F(m#o_>&Iwv2Oj~7yHZBsKdgTry1AFg8aDw&iyeHa^yg@l8E2RpNOhgtCXrZ7U% zzyNG~Xh2+56-m@BkNJ)A@Zn|~y?g;UQ2rUYD@42UrU^<(cmUdV8 z#POo-sNou;JLS1xH)tvO*b%F(LTr*k4^UV=J_CahD=M<)DAkgG&Unh)bjrDEMKxpY5nW&f9J-M0p zeS8n{Y1hpw1)PnGab7TNRbohojHBehdai5dL z8=;G&J}ntr^ZhFsYj%s?TB->9#HoL~#CxCt#4~eMEIO-%$H~~_!p~%^icT&YnT&B3 z#(6S|wEUl?VsHABD;95bCAnWMTHEzGE!WWufH!x$j_gbsxE@$U&x7@MqkeWT{eq?n zM|q}sWRi`2HTy1irOw6lNj$b3ALT_h0i#K-DgM|+BfdEB*v%zW+xXZtN*Fqv;|1C3 zeoQPWs!Yx&B?QHl#OqsP{L#&t;)VQL)t7H;}xiK zWeQo60Rfp~K#jgviTAMaX%p4YGGvW4^r8rXrzcuFzp|(w*B%JCU%zsTkq81&RG2S{TnpGqvX(EtmS+hzT535JfhzLV|z@s;#zllA)H2?*UsFb%dGUkD!cMGCJl#mV(mq4U_UG 
z@wbsvmQqKb1)5f!X_b7S>M^<`nfk=^C%HtPZkvOQ>^x~_424535fvf7P%rXL&KE4M zZKQg4w!jQqnu$P#SVp!u3Pw*$ARM;1CLf^wT8Dq-CI1&PdaXKxR#B_i-O=jqbh%8} zVG2Bw>kE-v5crmB;`3T#z7(?3MT)wu&OXEZTBPsJK6fF8%~YX8*o93*#QEqjAt@=4 zhljm;?uQEgLN6ut+`?ju8Qbm!B>RZceR3gQLv(l6%zct#mE09t%8dKO^ibWYq+iz;k>(`kkg7|lWcF!g3yB2ih zB%^p4gQ@j_b9#04T|+A0H43gdu=GqZ=I4)31`3@y7Qa%QQXW=d0|+SunNi{DuL~_y5k#9!wiYcWgHHHtOB-E4CfH(w#G_Yy})Af9gf{G^^x zVAw+@Nshj+(NG9{?U%a=$3P4PLOhfcCh(=|rCHN^^yf-APvw`M0VxWUgowuo3qu^_ zwS})wH-sca=$aP+NABgbILA*a9wqih!>j3k@@ze-+~VRa!0B1c^cOQ08|g>I25;@} zu0S0_JZC@z+}^&{aO+=y6Yi`y{;x%CnP>&rQ($UbpBZi6GgDSt$b)yb#@*6RUmPmW zd-^pQ13936mu|^1##w^j*E&{G`0aOr%4-f)WMr#-68fRzPE2_Cqs4ygVv<;lW7gy*#C30zq@%-?TwqOKDqmcYH}g>NDp^1}?xO zS$JtJf`_<)|Bs%SbT2#2>>NhDMuKyCOf!1V+&*!#ghz^7dLhWSxeA7cWFfOlb6~gG z9htK3f$G>IA0-x+$5Hc2G;7)FsT>de0Sk5I@Y(Ni!0fx?Gpz+)N?E#%yp2W(#v0f9 zxA!%_gR{9}*lK$pxalwonw)m;k$5WJ)RbacCgeR@juCmx9Ey>V=FmSAPc`!SW_BB($=x21P`KbNU~W z$UXVT%<3oa#wt5Fzh1j zDTFp99M`FOQm?zijr^2Nte9;2fkZ_`Qfz;C?1xN6LrgR-5eJcD@Ax&lx;3E~c#q#0 zzf<16xv{a)m9%qVi3V-VR_-06PqxU$##~@3KO&uH@mnitZmt%f^6peVylSzt|2SFB zEDYS$fln$t9IH}FA48&|f{wYS&pYnDB1mye^hO znz8W+!6_3e%;&YwLrmPI_BaCOD~!Yb7H(0yv@t@xdg(-|4(j*&>1OCPUM(~=F#in{+6X`Oc%MqXvjF^#

d*HLMALN;HftG9OiS&=MMZ=oe&i=I?VQu^ zpmUJ~5&qfq5I?`%zCKPN*E$KNz7cAw=4m7-bdTmQP$)C-N_v#!=oK&DZqax6tN16> zd7@3!X+V=Q$ezdFQw_KPQ@(2zM6gcIp|%J4$} zZF}Bv1{0X>w}Z{-yE=s`svt~%8Rn0lI`%qj`>NR()iT(_tMBppbkBzm)`NpWZS~!b z<2mdVC`kSs#SF$L2oi@O$kwxk1wKZ3a{JP2_-rr3z#&@2p5mgbSf_m_u2f9CME<5l zMkL=+3{BSLAn1YbPH6H>R`=&Pa&fn906xp6Y+_+BQs4d$O3tD8OO0*PQTREneJS{^ zPjn#FzWND1fw8PbpPQc_B3YfE*4>&E#7sEz@CHVn{#!~z1=tm?VB5hj=aoszxwCLp z8x*O`@z=0tZ~|vij~?^!vFP!3oTWIajQ*Ms{Nd>DkhpUBYgd;PW%(K=3ioS|2u!(0 zn~Vkm6ooa24#MVK4jKW3I{WSWmJwv8I{>ec)bFCV74^V}0NkK4tP^DU&-L>O&{aU& z(KhrIOR6+HEO48k-z+O`;Qt?V)<070C-U^8g90#2uB01}4_99)`SNHIqR+BH%i-CqeL%qgl9wC%|x(<+3LM27D|(ipFn zaK6MQV*dm?R`7pKV99Wi-wBD=j~;76p|g*rrRS-`9@j38j5pG!7qZ|BpYYrAx(1Hx zY3}QH1zB%XSF8nJO;poz5nB@z15s&&_s0)V-dx%j@1kNJurlsYvERqqKXLN@eovf- z2}V&|-$O$k%Q(T*eqFc1pJ4#deyWDJVVew=fu9+zGHUkLqoVAjafMHoaLPbv8+{LY zFc}}o`V{h0CI2O;dVu;_{FWyf+54^3J6N)sO`reS6s<_hH5kxSZ(25$=}~@I255r6 z(fm*SA+e}BR&!cd2K&Y7@NxQ7mU0^^2;QBNObPS#sJcGgxTGH>Tm~>jqM76k++&)eC!esKo9%_tOwlaSz0cmu ztw{I~qpkQ)V)|cmZ%Y~!%1A`UQ21~chQy_bBewx0 zo4%lM{QMa>LRY2r2E3B>QD09T?TBgjw1B77)q25-yZ!jSW2?`c4Hz_?J<>KBR&wtN zzKkCc0vf5fW)$+pK+RE_90~iyLPR@T^?@A)H8q4kunHZUVeSBB7k&PZsCpLwTkX!M z&k-S{(+^9_6&(k~bz|zb4oG-X@WcRs!>sK!>?-*ai;y{dK|b~(V^6+!1h}AZAL*Yo zIq@%=oKDTfZ<15Md^ZS5&YU*tk?Jamh35v>^rs^UOqBp8hB!U7lEBG#@i&SbgIIDN zTyLB(eGZTAJs|CDz!z3&L-nQld@ux#c3fLSu1bKT{?i_t5O!I7K0SR4U@k$Ef!>!z zpObk$!v%b_=9A#fxty=%6|QW0++QPh-Ij{h9HcU6f^AX`>G$Y&6dy5ecGxvst%hu! 
zP)C?zc#pVmSc;l^iY)Hqj${)hX|OG&v#qy4Np}KR(({v9Y-yb z0&L*~2*i?6v<-gq0q8=5vY_yH$;)28h4pO1x7HHJE#O31+OpX+=Bv(8TN?i*w)QBK zSn4Ec%f0*1;j-IV}Z+D@+m^4zAn zognb6R@Hw?EDfkr+$^gJzm8z%E?Ob^v0xxU8*fF0wf21ZQu`jN+sTABLX<)LKKkvk zyCI_iZ5P&TA4N9=+%pVuoXuxAT_{eVHKKQL>X`m!ZZ!`y*$x&==tl_k=i0H#B#C&b z*bQxGZ+Q6XMI`GK$JE3W$)zH=CMtpaMMMJaD)J&o04BP=md#@LxMgk1wPXDp!lq5G zJ&Tkzu|fr^Di1PUn~`<{0OUOrf?4C%J^?!yNkYsjNe9zRy@HO>K22e=F!LDV zqFQc-k?!CVm*CH-OB*ziJxTjbTe(-&%18uv<`s=fQb{J2o7|EGSV&=E7oh#%Z-{s# zzm2{cf1;tGu|4LCJv%XIN0}BT_X%~x^qyRf#k@mg$Qp6S@c#Z#d;3(92d0^NFt5-a zg&|)6xO3o&+Z9;9yotM5di@MOC<}!1;<@ajcz!r!B`t&#=tM?LVKYulxDZ$F69eS0 zVaFyZwRG<~t%~7c@ICV>E2)BUlKv>LU4zm~JMH}b4+fOS9dAjMpc(7p4Kj9y4+|&mTU>D~m0Nsb~FIiTxkbkKMdSbmTEhSHmwi2%O#@;~z$#Ug82}Foe zmQ-{RTx5KAS8+ zvQ`2T1$3rL4pvQ>VwS9=Srx5r5k0I^y2Mcb@FQ3QPRy6kKi2>;BKGsVK=BtU)}Hi6 zA!tO>>8YgG75B43`W?7UsV`Pa{7hrru8jQ?#$L2Nw;giIs6$06iQ_ai^l~PupEZn3 zOsR+n_10;{I~yvc^=u# z;jszTRpHPy^hS>Z*6N>{=+CU~6P7BV8bIR$qfkqzqLod4sC~?+{7n~^6z}=hwe(6! zb-VLrvXKvv#2(8xI5W7r=NJv!=Mxg~!5P=Rb;8KTHhN_j2BDb`@jl?3&IZ869tzx8 zAGjFZ0AP{Y!(ET)=#I3@`~Tso{*e?q=o#J8H~~&e{InoBM~Mk~*TV}>LXP2>*;Fp> zf8~WdviA1qm#a!jU%ldRNjN`*j0$@@Dfn04Jw+D-(5eGqTiz3`0u=^rVQe96#?v4C?@cUzK5G}!g? 
zKkx40^z}7jlBW0%eFPc4$Y<8-At!NW)yzNsJN%602qw_Qh^&r zs`D`fp&Wwz`0v2*a`bhs z($bZ=rK__`A|IshXlQ_QX0Lt5+hr0FiOzifrt!Gnj_jX)JB^#$M1m~RB{1{|4nHRZ zCRj>_Um^ujT4kk*yV@_SDU$w(ftot{gEW6Nun*^=NVAHAsMXW#+D0S7%gTfkqn#`G zSqR*FMJZCCw3o@^*>*<2WSJuT&F6W>sct@4+U!-3NV9I=XYCw&e>8tu0ZCPrv}4jU zl6Bef<+GLkVv<=FisH7pD|_$KBv(}jUN$mpGk%pFUjj{ZOlV1_!ROIafD@ZzJzpYc zB2OK1GXiY9@T;wk=C`)GX*uGVKcrDcU3*6WH(oVN45Tq(m#1$|J78QyqMfgA00qR} zOk8>z))^-HJJ!i~mBbUOWJgi*;jW$R7J3H)^FP9)xv2eSdC*(EYqa61dX4c|0KTqp z6%W1we|Y`(jBrC+vZ2&3!HV&$8po}Gf8^hB8(l6B_ZGf|apq<4#GdrgnNz)EBuPwH zx6((C(uTohODppP?%Pou3m^}UO{pmF^;;ykw3!;RyO59Gpv9`KR^7@*OwLkcaf_^q zS%H+EsL1=YAqtwpMjV0Nfe;KqvWhB`D$VX|oJ2}LP{jxg2iN$dNP*`JA1Bw2g@=sy z5iM3^86j1>Ht^QV6bzPc5>w0HxwD1k-gaLr92e4!H7+*@7C7PxY9>Qok0Ux3^x#4` zaZBWZ@TQI;(E9?`mOS#JDwDG@ouzH>KOg6cesGIo)*}N3!=qEvFI-iJut&J0vJ>v90C`kUC4pr?iJw|6Xcwz;vw!hcUe0T~nd~U%N4NE%6&f<~0+GsK z5(=x4SB0?MCs5BVGRB&g2}kH6U8yQC$uwv#521*T|H%}>dGr?6_?(HmnyKS0VAkxw z5GuLKB$VcMDvN;}=W4WE0x4&of(ENH1S_Zen01MF-;<0yw`9MB(ji*2d$z+JrL;jm14F?2$x5yCcQdS{)t`qg z5dP$je)W_;)uxim_vrql5}hD;cjluupr{wkR+|RzegB@wkVs-_+pnSqBJ8|{QoysM z*I6}tch_-I5F4tM%!SJ=XY~TWuD$War>!aEVgwAM>(u%+Z{rja@~z)LThK0q#{P7m zPABiI2h!8u1HyBOx%177W-?GUf!#}^66{9w6F?uhhX*}vXB69dZ|p%D7jj!Gx^tRq z+Q&l&E@I+P0Ho347W#jDvYHDpcFutuL6(2o>sSVf0_3!0MhkU{<1zNSb?>~5LZDK# zlM@{8aDW37PU(*kTQd(^{L>OCSj1-}sR)U_qk`i8mxe`BVYDcz0g`~z#@_dJHzjfC&x5==w&S#ceK088 zcBWz@J&7AQV%Q9Jd*s<^_n0=sC-49)_Uy)%FX##byDOl8-4%QjP{dyx|HGNM{%glu zB%sUYv$w8`oCUiis1PFep9e=NQT#uA!L;w}$zK&;eFFcC4&LUa8#oFI`8LEVP|$67 z%+?*hkcJA;2Ae`48NTD6uW5)Q=21^bwc{6_VKEqj%@%M$Z|?EWPgKsp@~}Zb3jDs~ z7cMZpbaVEB^N=zeZ>5idB7qJ%_rM27gJB>4g7~ZBi#Yl3;3y@If8Glmn&C@Edi(%mr(F^C8#-RTh04bnLX(k0DMLpKcF z1Ml*_ysziFpXYtPAK#B}`}o7Q);4Rc^E}oO`?2rG`l75TjgL!#i-CcGFDvs}6$9hW zF$MK;?uushzwoz$HUgA66FBk3(aHZ<7m&l})oD^g80 zSC8rzv8yr>gMq)G7aw2W-CuSVp7nCkuC+%+FFlq1o^a_SQMxg=+qd18f`5|xpedK6A5B~cA<16JqFI)S6eOb^JJx4J6 z2Fu@fzVkqor7^1zBNY?g=JsDtbbNG1wTtopms7T9RE;u!mK@}N6G1*nSf4#}3mUZ5 zg8Fkh7+COoiS>AAoHMEtEI8I*rex<%h>PdVZqyeuR<{jtyt~6mIgdl 
z=oWTU<1CY(Z(VfBzuZ8N>h~u0K>yy_Jj-AE`#=f&zw|>c5bS~UI)6Zea>6PzvZim# zQO3Tw>$i^a*L!Qh|Gn83l;dv`EWLo6@R5D`5%<%mO1@qpy)vV@aU!p0QROPSH z7gluO2|bamdsm7of1FtG!+-4=u>8EZ`#BYzU0vs`!@5w>5kayz8gvlidhni{I7?){ zoRpUh=3g)EKh|_c)#8(i0y6~@$$8wUJc(7wWqxtd_`v%X198%~z^y7#@id}f_HatK zsc)&@K9|{?>HXUjPgcODM)q^XSba1u4(1jxId-mW3loVq{Dg;{jb1&WbV|W>V~a2H zzXb<%89s}|W6sz}C{%Ia%R8zkRyYZ8^;b3QJ9Z-d^OAu*&8!JSg9Pqkwwfwj9^8Jl ziAbc%jMAjK3Q|;B+&>pxmH+H{F$m{ED#l-iz4gp|$yqY`LbVXKJ9p0j$<;}j<~rOQ|) z7%uL6P4SoK*7{{$g$Gm(W!tC;bm}rz{Ce>zR8r|+F}8i&z%S)jVG^AFo#s2t%(=Ya zZsxa&bwdB;NxLS~s6-mkOp%}gIeys(2YlA*n|KWh2XCG(-usl}iml)LthncB}F`06hDozYtV+`8NG*&hV$pq z;C!Mq2RX?t5AO`-CIg2l$>=DyDN#tu!;d@*1$3mfyroRxkurbrS zAuA=qKF8NB|1ENK_bJ<>m(*6XXx7&6V`sRWW7ls2kLV$nDp_9}mu%b!OH?s4^1a2= z=pO~DlSm7!H9%6n)6a|>NEoZH9o+WT?xppv3!>$gBGZY}G>dUQ7_c4cL=pu|M2+vh zAI)<<=9*WxDSIKB|Zb1)X$?-2E{k&p+vz78Y_4|HIgGwR(Ew}2J+UAu=C4Q zU^X&Jzve`k^AU>kj|-GSx%0Ds6+D-SOHH4GpIOD4P5(Rf3iY7gf!|q%E$j#5tVWDu zdEZU`*;qL%yDb(hqoiG%@=LNSk5u)uxNe${K)*5515i_Abi*&npZD>2mN(t3riw=L z$E`bR)FL4|@hT2|qVdMuoNwo!AEC&NHARIJc$_)R%N}_MoWy+U>Z(hOxUhvir!8$U z5??LMfaX1m+qYe2B#4cS?uNLLPlZZ$zdcUu2ySf8I>^eNGF7;^?kp|I({f69DSPX$ zYW>^eE#+1-*7;J2uV7;;*!ep$9fdjP zvXcv1AGyZHu?LwV2}JVhLGi&o6T})tX<*lE+LeqvITLC_ncVi6)yU*SODro)WWV&B zyAUiU9_(FAD<=CyQ+FyWf>v13&Ge2^X^a4hlZ7(d%TeFQdG?UeV*gL(Cwa06CXPx? 
z4f9U2)34WDK6GWi?_x{k?n>>Bi5JwW$wm53DUqo5p3hVq_v7}!akB4wVLmd_s3`bgeoj=+_a`=4i)PKFD;`#}BwU z8kGl+1qJbshn81%lQ@gVUJ|P$PAeX<*kqz*Ac#!HIiN3vSSt1jJbhcd?Eq`9rH+OT@a~ym zy4bP54Stiifk}&B$;EFTJt59F{S1e6<-tFSb!Q-wTcNCAPDeE}jAPYx4zV+;)^Efk za&TgDTl@!vabhGX0`e@sMQ}8)f20!`%fCx^zqp{TcABqOmW{*%Cn*@>sGyn~!_=ra zS3Wj{qjml)zp zX1x!>OP*;H>pzb+HPuaf!Wy2(X*VhlzHy%0cl&OReobPXKWIBX*YM(A`}Z8QOnDC3 zi&IWDqJYM>cYe8WYa}x3(;5zP;m5x)@ds9+pQthoJmy~wW*^vE!%o?P!-R8q=rlhD z25{3L9@Wllc{EI%m{%jN19k!@ZJf*7al9-EL1JEx4?stQAtjS$q)Ma}^?m)z=!vX% zZ*TLKcC=T&Z}i5>Zj5gyV?DWJYQy&5te&e3kAYLi7is&dZ7;}s@e4*s#Bqwo^hOG7 zTj`)TFCL=Ntl!FX08gIsS{uu)k@=P^Z|w$@mC-p7+Peiz#bX9H^c<*P&ayiMiv9u-&WLGY;tm5#YgxQ`Bga?Ub!@JS|f{wp7MR#*&2{NPl9A8GuaD8$*wufEAN9KWL7*lB z=lfm1b}0=~6ry0K8UVY$k-nhgXCyZ(rTJ)Rivro?%?9d8=3JNf&s4_t6Mpj>W2#A^ zbO-H2CTbVE-uoPb_jSs{f7x;bi~Z=CN&Eg8_h&Dd;f{bDgk23lI9iGNPdR<{BI=TC zV~k5awDWO?XxnZdFChxnNl5;~*K-}=k}2kI+)`g$7Z7l8i`LyZP$aKA5er3wWZ|1LbS0)@}9Ur2dOoX-pK)chVv6fX5Xvri@=)zyk{nmrNpI zg2RC`Fkdp4io?fj7S0r(NUUXTz)vj%$J*0JW`#4GJzvZ}OkWpG1Ug$aij9lsm6ee4_66x1nVXY>!s?8-xnHkh zvOoV=o*nu|h}NAF(H8RLU92R~|NU;MaG*GjX{~D($@{?bRwTg``jI3=` zJd6_252pj)ol%`+V3vl4MGo5b*%>{yEPo~HWf~R$DNmd?*ROd-y2A7)r#kV&GkM9P zdE;ossR`98yk|^zd1Qxrv#d;BdAfhdef7*!q)D@QjM2q}zVcwvxvlIH)L;;R8y8CgE=XORHy6-<2 z9iSyCQ%jyGeouMaaxnzqqx>huM^MWk$QB_42kc4o6DORzN$4XGjfwUO8xKdFt`L`68qB*MW0&U zB-UHpueR;f&aL6Kwo&bQg3sB*Dr8Cr$4NA~4;NPiuEGImpg9|Y$oUjoN4qMkv|T}| z9?-!uP#5`wZ>Hmj$sNNgq}O2Hl@~~c|1ej@Qfo1OZpolm=Bc#2`;qMf1WV5xC*E+# zpz_LjJ;_sL>TL67Zt&8#nhP9J7e4MU`4aXbqAqU<1D5kUn)jI?Jw0L}jK`;u68aq= zZaRyrug6>BPwKC_vh~7Fic1s>zX`9Q+HcDCe+8dnqracaKzva@*sT+e?m!ttJ5d;* zfA`@`^~mxQ&imiF1e@8AsIs4*>I~Zqkik1PZ1LX4oweMc$6MsOg8{Q?EVdfV?&ao; zX7(^x{m(!MH_k{oQ}C>m`)8VeZtWpFrf}%F1xpjj_c|}dx?Xw*cdO*DNss0?a{5M; ziQ|$|7)e5|#E6*@br1^uytG`%Oc;5U=4pFD6K=l8`-)@@NzBP0#Th7<$yPbC^H_E3 zD5E*2QLaiebL6tV_ZVA5oL!!U)Sx-7a^vV&c*%>>(!^UQ=N(h1f?`_5*77hoQ?0^9 zwmjB~i$G!E6;J%};WLITQ`TOOy5679wpJ~V13w+H1-$mY8u!l>T!d1N&5nu)k|UJn zYPUu4^Ccr|Kvr7jCCTctOl1z&1X_Mbr~7|#K=asdPC=_BJudiny`JEM1_->;gWIo) 
zFkyIJ(D-%_n!yfebY!lHqs+N^L|bZ=`uxTEwh+ix#f%zeN|l)Ogs7JLM}(D&pZTG2 z`3unHEA5|T#)k2j6IU#rD&d%aXcnzK5TP2cgl=_+=x2M?{n<_{Ljp^~j$WUhp4r~H zFHnnx_~p0R@Z7~8F?}ahEYGtnWa(%Vq$U;Uc%<5*&siIU39W~skZoM zgar6BpCO1-pn%vDZR8w*9p&I=C*rfE5-c5t8eLyv0I7YCbN(X4E4lGk2+b4&`l%m zFKqhx7q;iSwsAVoNyWNVo5z_7Ec~rY2N7j^{nx&`B9Mb|f9G2Ur_sYxqo2E!S7$%_ zs%GjXmEyQ@4dS^-&4Zr+gxT*hS^M%Knn0IH6x?!mdWpC?Pgs{I3T@DjE`F?L5us|- zOW~A0nydfbuOfzRaXT_I`ID%*eZntE`g>7H7Ge9>Hd45u9HyBICy9?KA4VbSd;H!A z9n8S6I@aV)L zn!2Ab=}A8#4uzk}eXpPSz(G>_OJ zSLqOZ&V8Jj4HAHgULCG9u-v(<%BS96HB6cC6;gjj6rNaR?%Hrs{H=8&t=@fpb5^3G zN$P-QY^ro(J4AjUPdlS;4LxPfbSRKk2R-UQ_t0KrC69&P*Fot~M``uQo?xaRA|Ws#q41w$N;uC0%o?T-tNZI#)TGATuD zjw^q_>>0DRYaK(>BTQ*&GJE2ZDz?<+st-bL?Z~Roup^q6cl@Z?_%p{S1N-&(GhAZE zRTi{aj(?FC#y9dhj%;j!qg5p~$E?@;G+a7PIo8&WH@@qS_1+<{(q3B+s}*igVu#v% zc(?x3nI63M1WJOV&+jJVGUft(6!VmuVxm*)cwtIj|)Mo`{ z;uUb`;;M4PQB~@sZ%8bt@7+0cFn-@k8>eLM(nPP+HtzXF4ap(n(Y@p2E6eWi&pgdb zAc4gdPV20pt!`en`9rJZ)bGwpA99V~SLm#uZ}iFc6^%D>LTcE3EYcHRo$MlYlhv7# z>BidM%Qw-qCfDx-^B~3c$Au;0P_y`1(m*!}?=Zii?O?dFx z+Wa}yWBcMHQPah%ewejZ0rB$b@sy8^IGNn0sPXD%O+CMjL2)ZhgJO>93Y5RhM;00M zEDC5;ML>6;Yi3--qRG&D{6B26@@u#WDem}!72A8I0+v2;t|6B`NfQ|!{ov=9uRM*W zx}J&gMl!Cc28Kao0Xo>r&LJbWYdcV;AHNPe`VEA=qU=#rkC(u~qw`<{7NltFgs4}i z>9ym^WA=YW0RaTVcgl5rz5g=qqxax!7!9)Asm4?A3#+>G| z6r$Ce>wkAZOrG9m^CJJ1AP6`O`$avXX-PuppZV!MBfWQKC0u}^BzU03LdrsK94D1f zICSB~B(u28mK~9zXL|Zk4S)iL6ESSqQ;EqjTWcJHW9ybv>)Bii1#&XI&n}y0O#>~w zh#07vg;bZVQUqVEVu>lJ#acZHe8sYB z08jp)BqrBS&-s`+l6a9e0RS;*QDWhosuqt_XJ%#={e0Xb=d~3=L+1oJ8D9B|=O3b9 zJ_a+RMy2EuCCZ=tp;ZA=dXn%DMOrcs{`$Dsz!yXtCqTjMoDUH#S61f#nf5L3Tf%+4 z+xpw>jUy?4KqvD*)ER&xdcpih#C!5D+39c26(iR8&jm04Kfb`kzmD=Wwbos_g|qy) zi#d3ha-9&{tyH7@%H@&LrOA(e&}nFFdY1KNYcVf{wX_E8>f-Knces9&nD2;z-wWGi z;m+1Q;s{c_e1}fLgHu<|jru82!p;E zDBS}A!OS**daWFlrGJPn!M{>uUONG;$=Bmzd|1rs>07&@$M|sh-&Ww?{MG47YP$#N zOMLgN{f0i|bt|5hL-@^J;m$~|tDWjv-A2k0dG?r?XdP|x(USbCXA_zFP6?|&X2Fi3 zc&B@Pc0&Zg%|%3r11f|qX5lvbqy1|ko#RZOeq+tIg~}bsi?cW!*-i)X#9wtzB~q9B 
z!!VSa1GP%W>6iu%G}jeyfDd?p;w6Xk4=K?PS^guP)8cC_d5!ySDZ?W8=d&Mmtb(15 zg6&Xv=7@wyjaga687wcnx94nn3tR$kVC0z;z8ctW8=FIpG;+Z!c1On$(~3{7R^4EJ zbB>!j(-aT=QteAi0Kqc*w}Pw(ln-T*|4`<_@yXy7<^!Az%ivFIB{O7~*Q1Y*K>lAl zQEW9xl%5w@Nj=i#45lqz;GU%lO$ zJ+?>vtlV4j(Z={(J!>bCGuSXbP&GO3UJ)=+{kPY)2P%!$LV@Mzt)qRQ788$6VyzWw z93B_LM1LaNisz&&S~0zoHQ0TbIaD%vi#!~ZrR?V>CC2$9?z1kfH196Q-BPf@x6<|k ziWVFt#9iXk7G#waRMJs$?pP-Cbo-$H6=y(^PqEyR2;W}h4^%BF&=}M{-2EsQ^N(@te|QrHQyg)*E^hE&8n_7- zC3J0aMVFs7{`6RVTigvHN$a5A6#ZlVeLEmV@9{u`7Mb0m*Tp2^_GO?Mx}D=TtekPA zS{vsVt^0V6O|{5-FV8j5W-FZQeoH%jGbaj^(xo;k8}6L#C%X!3k^GKYbA6MgDDQC-YzOy8u%k zDspGx<`NykjkbOrJL|X}n(}r$$G|O^XD63Qt|j+~&u(1o)va!YA*oEo8+FCKb7F+- zO2oy`8Y6G^QwO{^LzP3K#Cb2^l8P@SDEUnN3NNXgWi3a_s91dc1ysMVC`lyoNV20E zY?g~iDYtOM7e68qju%r`VixWSRK=#}F#8cwG=I5+ute$fJ!X>w^uTUN&8c3v_VM-m zXa-ubt-Y$r2?`CGv#o;@e%5d+q;l6z4+v~n6Ihi4^?wsMZ&oy z28(g-MU3z?&5{6ba_#V(_>Gw)=rlbf>(#R6PFYYTO)M+^d1&GsdiqEVpgflQXJhX|zwpg5M`OzW&nP6u!C>+I$(o zbB{r%F~xH^4IlTguLy4ws+&)z;m6jL&Nb3M-E@?EybGk4T56j~N=4pXhz_EjG(c|p z;Q|2aebLU-EC#=^7`ycMnO&i}u6e(&i|)Pjiecrl9p9{q2@1Y_bq!~uK~x*e?nNd| zho5`ZqPZ$vN@)`Kid8dgnpkj3W-gsZGSG%$g%8$8P`TL^9Y;l^6w2>`krX+>KS7A` zLG|8MxY}P+DSfu$E}6)|ULEy9t`d{XRQv*F`bAm*YPi#l2O{%TxQ{M&<;N+Bsd9a z2x~&Xi*xHE`EPbUYZsnUHF=29|0*?1={PP3+`@kzrm}ZNm0CVs`39%7WBeW-^+@we zWCwpEu@buy^B%|-C2={n=>xPcPbQj;;w-RKfhmTwM#_9Y{}X)ODf@s@HgM|#l&&y! 
zN9|m0klm!Y0N&8(C56(@B!iFGbgqjpXi2|@WxI`j5Tg@`ZQ^h4V8$^lUOT5Mp4*!q z&twTZX)`E>&Ujclm&OC#X+q&i<7^Y;L(nsEcAJiB6PhDi^5q>UH;2=Y<_hcL64gj# zR*AxF)?x*CX>CWC{o%c-u{@6x1JOD;fTHuAakR7z3N-T(!z&`VQrzeWM7|_^01PKN z2m-S!ACOMC01Fr%atV{R(gt{{jw`cx#Tn1D!&OP|kt1pY*xvjXSPC1W@^A|MQ^WV# zwheXl-Mb+z;WCk$9j@CLAi9Ub-;^R!Os&eYLvQX-jCvfd-Jxr!%GaS45*J1%km$nG zgc5T$@PSzh>KQfKH3i*DSWZ}4=(@O9fAeT)QA7}2YM=S=XGt#9;vj()j-KC}UDn<2 zeswENfWjnZBo0E+tn=c^a2RQOhQqP#CpS2rK7)C-nos%`<% zqky(1vC%5o8`?ID>%|GVjcEX_gb8|r?;adZ`R-0tc}0YZE*(!wM;T?L5unaH>wR{e zHzGOu$0(Z4*=~+O5e^*^50e+q_7BqTwF79AI)@{Lw{OXgR<_rEA{P!-HG&yxwrJ#WhY>3(dO46Tu9JtWzyuK+T%vq<4gYlicH_k9 z5%{r{uU%8!;CL=HcWr>%MYp(kA#D`g*>#cuem)9{ieuk_TbR233>X?41W?S{o&V`0 z^Wg955P0aiRJGvO`V)X3jj0#dKxXBXHhi^UZRT;fB0R*dv>7he-Z9kL-8*!pnJ65a zz!T8KCl z6s{zj)@wm!Z=1`GI+^*kQ$@OQRH3Xvs#(GOGr)Ox}@wKiw>yz3VK`uJ18;>_jPa zj<-)C>%0yzK1*0|?OvyGI6wI$?C2=wfM!(W^FC;>Jx|RB^)=(Qs?$m!!z{)7d!8RK zVm0^Nu`u-T&!}=S{x#2+h*ortkG5F8<5`y!LAnaQ?R|an=&?JjkE1C|Pi+Zk_E^me{L>Jpl>xCD4Z*PgrSjD@|Gkxi>{a!A$D z67Py2W=N1T@Gb5?&}Ymd8nP*-uyniW3B*Jb*zs8(W9Q3Q&c>_GgE=}FqR06Q)Z<|T zCC5X|1|NqHU7aE9?0K6oy=l&LiO6M^UT?Y=^!rTHew^CDjP=Fy zf+J&^F9qMUSO&`tRkFKKT;|d@^SH1ShLRa&)j~eCkMQ(7mjmv1?NL;M6Z3k!e6S8?5m#FrZBjc4)T>i z9Dx8l0Y_Vi*)5Y>C0jt+cG}-(rHtkAe3NJOB7%QK8l_N~13j)FPzF8>sH zUm^y&dYIM@c$l|>PlvriL{;`>>PtfhB`4@@Lp*~i)E-%`iQ%0hwW*lL(4Zd%@V1Ax z7ck1xrOZI@`PJtMvh8yx&%t1L@>rpnPbCv{O#?u&%E#W8^X>t8PZds`x&-sbLo*;I6?0iw+2pDx4OUDumVy244=0aFBQ zUBH-8jdP{CpsLmOPSo-xta%9uE<(~;qP`C4&FPA2-Gz4gHR=)WjhHo%?n>BXPIB*1 zcuR~%MU5u%UY}psaAsItTsLk5yd=1u|FY3gm$G{t)+pk1om0Q?qai2U>3#o8W)sv3 z8I52a#7Bzb_dKX0KR{++0;8j9_o0kphXZl?l5#yL+iBlY>7&4e6A+`rEV9ut@I?#j zHI6phBCL*>_o`TF`xJyNa^07cF|Y`>b~PZH*Bw)YyMPY{KF`}SS|)uKJXMaXaj$j- zT=IR}fzaz7u9Y^FP-Mc^t$``1cV~d^YQ$q>g{}TEd_n=#hEC1q^lY+lS?Zg8kOTYV z_lvU?pl*1_#HFY|0j-%_x{$1Fu;cz# za{dui0#^g`rRKZBV08b(A#dnC(e77%^HA@_);P-Qn~NkdXs$MV=$>p%Zu3a(r<~R% zE;V1_hZ@m!n+1V7K zES#>RE2_CB3$^=Vre{>suGi)n5D+t6&=qIbH0)}yab#g4%pbaT)$|EK!FUHJLf5mm zFI^BfN71p%rI{VgzXvw{kpY3;Qu0R7>wL~cU3Pqn#6{A5Ymc7Y_%(D 
zM3fG0YJEwxhYtf)&efd}PUpvOi|{O_Ael2-E!ny?Cby+q1Z!pUc0vIQ3OgQCyZCJTdQx4o{Lh8#D-BmTyPT`bQaqZHNx(Y- zC&;Iz@a(F5WZ{g^`X)aFCzHxdd8Hyw-{oHLHk*Z(q1Y7F8p0o9k;PPCER!z7R4KIl z8d!iW1aRzb)?-iz!PjZ`>+vBS1jA>bYsCyQlj9cvrcn-eo+vBhQq)}_xOWq39iBcruc)2 zlL)*my~(8`G2srp?WF+>X>@g$&gNs|3}**}-e(1Rnzg2(T*+d&?ZzxZ!;a~tuH#9O z1==5`OHynm5CUJ`xgrlX&#seRDub6zTTKZj9fPy!>Nc!*&H=xc?duZvqzku=!9={u zKv=kRQM+^;o)Xo2UxFiRE~^KnzR$+ssU{+6Pgimkq!U3h;?eRVW!efMl%~y8sm}S@ zsB<^^Lj#H!y@rh(_$DLfPnEEi?C&6a2j#eWh?P(Mqc`8vLb8QlNgU zaGZr@d+%5%I>g&l;{0ulY`JtvXMHdI+h5O~v(zCZxud{|W*c2@^DB7i&#;+;fKi@W zian4bFb6x~hXmwd=3!@b62@!3e47ymY&W*+iZ&DgZ{xL=!HMV01!*R3>o2tbYov~zqiLlObVM-p3A;ZiG71J5Vhri&U!et>i&ET>i+Bb-a@gd>PAAG^F0LCs7 zYjV(U6X7o=yeV67lnX!5I-^RU6_X}8A8>diB6#im;&M>}^!u%wK;A%u$e*_DlJLHZ zC}v{okURTC6K7pqce^4J0faAJH&tpJlG)K4r!?}c=PjWGqtX&bOeB1p*`y@23Rzyw za(`T)xQt9Hl*x$l!C}RZ&>~tdw&$lL*t|NOS9@HmyPZb*&DsrVO-Ceju8F3kCXhZZ zC3(LpV{x6nJ)Ychd!B{|2G^Mys5S(tTF1Xamg!GRu|{$Z8{MAS7FF;_SDJpgq{b^Z zjIx~t*Pyfrb!r?oR(8#O&djf3Qt_YV;_s>xxnVJ20DDrjS_Cd^t)C~yIWc@OSmcc4DEZpjqXJip{bDm}Ses~0Wy%bkdeF|)f6XZaSIPCwPzH>d-) zMkPzR*=W-|mv{gWMl?<11@xQPavz?=NzNzV2th=P;eBeTNlA(vZgW zD>kLN*n1ysZY-0(^$qf%*p_yT9-6nuF$=)k?&r~N;N35LD`)OoM7->fdT2(i*;Y&4 zvBVYAm8d%i?4^?GQp?+S8&D4nQ? zRg3hpix`V}*FHL>y3wmF?MqnAV|$eX{$*jKacPIYYO8IQe^ z+Gtya?;~yDn?;&d04=6Qf6d<(up4R{eQVI{J34ZttDo#GT=wpbpl4ckR^>p>&OXY_tt@#$kS+r(2gIYpzW(4?0yPCCcTZD(7|t2ll> zXBwk8L?ju3BnY;Q%PDk>jxFp(K-mFES`xa!=zw&#J?G!Rrlk-Yx>mP*nUpXnf6aXH z4(~VA9evy#X)!4>^2AsAT`_ZMpcP_6-s#KEO-;ISq;G3KtJ&>+i;2_4)r<8#5J&r9 zN`!ZldD>gHVM*Tm?0Ewxn&~jMy7@!>P%m(hCJJ zV&_8RGkBZVc$9e{?`=q(VW|B3S9H2$-zA`3aRgpW-g$pj&c%L3g9lgM|C z#7;UXwi-jBi%I53XH}v1O*Y4?492L|9JGHrmap&e#^0)_v@MDlAmtNKr-BecDdxt! 
zrEy(+SEr3QdZ6vUL;0x#VRV3^RCQxcTBo&)RTj}KRJv;DGg44Fcw zj6S7o|KvPf*U7=_M#m6phy36=kI)C3huMYbt*THD#>r);01UvRFM%^J zJCYg9R=ElWTD!O){Q|{`@9r*5cxbZ5Y9|t#*MDtqVWs54`q&j1&}sAvq+Qx-&XJZc zfdB^m*tafpM}xW5NOR4n_{&{PR%wBDlZ&|waT2AS-&+DwngxCEFkrUEsg;Sy?)`P2qH{26>r@wfmmG);$ z88oIJOXXY7X6*ck^Cejm_MLS;3eoWDz3|*lq|vQxAm;-LnZ91s14u2i%x_{x^F=PW?^A^;2njU=flJv`M zE&C4dvt!O_-^)gu7?D8*WF?BpY!_|k3-4GW-)3t5nK4C(c*j9+jr=a$4jCW)Iuoz9 zdTdY3Z|9i1=upuHe1RzkgXNY-y}8+|zZkZci2p>Q@$C%!_3(@u2a-LjqVjn6$m3}h}5th5!!N&Mtj17p} zHxy5LuGSE=4QU? zc)ZQgJk7z?V=N!hkmCM!8sAQj8`Zh|u+=lr_kpF|tr<^6-N`JufPE^;aeVNL$h*KN zq)tK%b6B5{PW*B)NTZK00HG*xY@99U0|_s~HNaB5;Qrk$VKTl$)QuS1YZ=|cebQ?Z zbsu2ni2ae>UG*-IwX)~~5QuPgXUf+{`7q>P9Y*}X}vnS)_NQd;PJ)+EGN^fz?f_W-J zvpxV^8u)z7 zew#=QTcAKfEi$Txl3|Zi>pnX>AgTxXQ$J-g2W+|^hKF#W74W-9|N8EMd+8(U;U?G@omv@jV;7d82vP zXQSem6df|$<7f}G2MYgB+8H1CIEoB664Z#fC|JP*1V2gX6w2{y!XYRtg?;|;7&}Jl z;cs>k&64T@Y>I1tB7pUj8gz82RmBo7;d|YflA;^OqQS$%Q(_1=C^ZmEdh5=7#)ezbhA zN5kXzrwqIe-DmjRKQOH4xL$j=m%sV)==~c$l?v5+2)0R3L z#x-Bvj?S$W>ud&$BHm%BtdrD(Nf#YWLXIZ(%Cy`~Y&)Z%WK|<J} z|AM&4nRoU8#whNd!5$xjUS-TL7S~ElpspACX?eb!t(m3ZRN7FGlgd$s<;aTUXp6-9 zK}Eg=H9q$q86JtH?@J10)#A-fKHrJs)EKJW-=^5j?tHzOS_ z+lN9*!Ipe^Z`dWk0d5}w;|n~wM4(U+)YF;l=Xqn+nVFeOi;G>y$TB0z^MjQnQ+ao4 z>dU1oUSA*GY;VM%);2IC1sytWb@Ob?Pn3q}P$3@R1t=88Qch$Tbxy^#x8A&?{ui+R zPZApYPG@KN!?!_N>(<(X`ubbKD*#A*az<5w1p2+4(v(KoxP#TqfOD994Qg<+Hl1ziE74+$oT9dQvp`{7x&z^xsD&ly)2&v`c?wvdkr#w|jM-HtY0Ye!orN)+pU0j=Rhm6AuSm;b+ z<+ff=_!&QGAJSztLZ9)r_sf;{$cJ{D__jJD$dNGE{7kVIeusCx{S=OmuAuP=n_jKs z_GEcm5aG>Qnr~ls^%Jp)t)(S6@Xyc<(EZLw3sB_*`y<*#eFHR$Z+6 z$OI!qNv77W_?~Nw#qu)2#>_Vr(=&8d4D8=cXl`4*R6-Bn7#5+gGd;3*jjRJdEK-IZrOI;yO|mBzI|<(O zSe{V5oj+-m5m8hNVAqvPSqo$8eM3h%Vk6W_;_PG8FVW1m^wyfitz81gQ|b|pF~R-! 
zVj-KF4$zgYjb@9BHJxJjnOY~$CMXcL66m9mKzzz-TLjUA2QQ-CQm!Pu<)(&&4ZXU| zI42ST!1AzSygKwg4;JxPgxG`W9XydieVS^EWF}=n)}1mQthMCPFC6oR_M(}9h6a&{(VoQhKXl&;X@Sx zb~RzhNp!cQXNQDr3f#&NkPDY*yGE?w=#tP9;`l?&biNn?w z2iO39IQ-_SY~dFnC1F(|Id`>wn>#=0h#^%f;ebJITwdt39SX&(9tr z;^nfJb8LmYJ@+2-=7#T{o_`kS)PHo)XIeN4$hp(q^I+in0=)mJC<5*^L}>|I$j3jo zKAuBND+Ry#$nxX5O=pa1^xH|6;VR-YoY$`PsjmZJHIf`rtWYesqQ>hYe&MqbBU^ldN|n;^n2ashl;27Xraq?YtaMOjWJ{DqeFUIUoi3jOIA1sW zMMYa=RLt}W8nqa$N~)AJneAj3lIb;QKz`j%^8-5PW&U>m*m0=|m=674H)?cvexR>A zI7=~+vG5`1|8Vx!aZ#<^`!I@%f`Edek|I)4Dj>oTA`+4c(xuWZIpm-qAd-TVlyrCJ zNI7(O4c#zw4E5fF=Q$7O9MAK8KR^HC4`J_p@B3csTGw^0wK=gXB-IZA=b-6nci(Bc z(17@Y+lAdqfywBXBSRlO&N=c!r;}GqZoxOkmuRUjuTTey0kPfPxI0A+hVYaqlU7IX zWyGLbc9Wc3zhMt@m6LDGExIitwvC5!rKggdS^ZYpf$6ZuH!J)Dr`GjV#y{n_;WZ4T3lsO}6Rh07-H7?Sm` zS@%SU0#I~jum&-Tt9+5kR{R6ZV>8EcqZ@gg_wo*j6lLe+ejS;ImFX>#rQ>NA&YH6P zOywLkL)}(AO9jb=z7pMpbuz-JQHPb(qVPMaL0sR}1ADF0O!#cx99DHvAMsP-4pe-c zf|-yOC99h(9LNJzIYX6psRV|2{hdnho{%G7`XV7vREy*3yN(8mw|_j(Woe1_P3qP6 z;nhMa_#}ESkzFNH;{9K@?+Pl*4>B+B`8ZA@1LYMJ2hj#d+i|`v_~JJg#D)i5?u-jL^U3eqlA=ul z53kkH3~b2p@rfS=)a>m`d8}`cvPhqS%c0+&ygT8gGqT-yY$~(NV@0BG#u6cTf7ju% z$DyCk0Ba}CNoF+%ZL*|o6$|#VuMQ`;uP&8m}sLxi+P&w$v;rn zV5|q*3^#|Z^gulPFQ1Ar6a!x<5eC$}c=dSz6`!CZ70ENZG(c8a88tpm9Y{||@aOpSI1cQQq@5DI{b+RUSFubn+K>DgOBvF$UrF-a{!o-HYeSC!GE3-j0SzKt;;8 zQVwB=Cy3|)j&9`xX&-5cvj?fwD2yRI`}^ZVX%!WP1n~;?hq(}I!>9Z!qfQU++}T_i zk=mYe>c~7Qw<8<1t&yF!b#RDJqRQSsdF)llKOZ|59-4DhKj2;v%RonDvG#N=Whr8q zNzu&s&_sMk*E%}9@;UNqNT!BiZ#HuY3$0~^hCdBGRiXZ{)$zI=t@@V<*ENV{=k! 
zsLFkZmEpFZekiS-5{)CH`z$ObE$_Ip(Z-~qg`}Y zK&IH5IVcN#VyL{y2D5~`L{ygX2=gB9QB;gq93Kv6yvz`emdU2a1177)_zBFb|`S=>i0gl~a<#{`1P`Cu;2 zPb0P(Op3`i=t^|hS=mFQ-90>HqIkm&oP=j{OZmZT0D{kn`jtZkb`k^F!Pm&RRuH22 zcF27BuPEV2Q++=h33#ZsqRMWhtkl|mWPX0S8y;6%G@P1`Z-1<+q*;j)BE1=)?xZVw zbKsNbz}({E-kn*r9}<)#k|H7^Nz##_9A!4kxi72Jwjspy(h=->zC}gQ8h8Az&d%0I zh-e`pC9hJNLWZI_Pue0je3BekhfcI%P`aOS1BUS@IbRZ9nef4)9O)}J{qNs%7E^pC z_BxMLe(5j~pz(3H15&soe}7d6QzKBT9{Jq(@IzH$!bNGLnAcdS?%6 zxMX8xq%6bPKRzL0q}(ph_#6&SxM;i5K1FVlN`YZzW#x>wwx4ijf4`E9j7)&L@m)H) z(z3D}b4H=n5bK3}$32V0C%Q7vp4E<&K{Yqg<>yZA;~c6D=k?^b(fYrJiakl8S4|+f zbpBz^r#-RCfX-aY5p5#iwvi4DOAGdNBkGNc;PrX&V^;3kz^b)k`EZDyh-p;#tMsle z!^l>I)QbPioI#ww3}`tFIQtymgx(#=IP$goAiXIB<#>EJ(UZ1gjO5QV>eID}nA0gm z=j2?vdbNVb+A!Cs&lu?JD5ye)3a9x*t+)5(nMpk`rp08C1ISm@4Hxi9tv2Ug(vi1K zH-+qPFFT+{D?WX?xE|7=Q}}3Wp=W=k48@GAO-w;iVZUV<@P&`s>sE*5d}qSjn}7tL zWd8fh5$DTl8)?sa?&Di?SLk{1b>**RxT&H! zeAgA~XO_9yjU)*R(K*EHBV5|(#;iEVs6#{{%CH%95e@`0DaOf*)w)rrTa67*dPnTo z=i551ppS*27x7ke)Jabc#sNXwT|w3Y{Ri6!D&)~z-1x7B=;(;f(5RT5>p0&M-d89P z5()Eo@NjFf4_ROol9h!jx7!%Coy7k+pN)xIDhc92q>qg2CFei;_|PglIyy>6^GjLe z=-^vEz#$;7-=zp(rlF}p)BDTtlL%SgbmM@o46jWh;I$RD)PD&GAX>i|gFcz3;}qpI z-2Mh6)5}{Mn*mK9&+&$m*ffbI86{ z%Au}4^_bp;@?mXlwzj*PR+Xj?EjU<2X+u6WYq>DjH@DCr9m7(~p~s`8@7@aJybvSv ziL&riNuuB1$mh|7at{Q?aQLj`+!xI&q>^cs72(Q~k5jv~Asz4l=#-2huzYRNIB}hQ zu8gqES9^90Vi^Dnao2to`X)pas6FEis{HVYO$fw>obb{b7tZjTYDzfp2S}cC=%|A5 zmK_qXYwLpDqF!dBv%R|;YHVO{Upj!|Fd4L(s{f)bBPn^AJ!By%8iwAT^i#}KyG21^ z!rHq8?YB(F4ZkW9V7<`Ya0lzfgJ)K{ietj4x~8a$uq3g6>?JPH8vvXS9hC}op|#yF zG~2?o?4!!a1kBpz_lVKyp!IIqlWFCl=aV1wnt@()`27~!GqKou*JX2()X;b3O15ia zkQo9>Rmd{)pxnt>Q}4_8pBw^S()P2}_H7JT?5Y*m%*;hiA5<{67QaIdn9Q2l*VFRS z$k1JFqd+=zy0o}{>kF26hB~T9fIXXQ*`;quRk*Uc7qUXX&SUKEIo3kZl%&n z4;YDL+|p$xlR-DzsO{#|koCU7;M^oDJn-xKPVEsiu&Xetg4^j8w3CA{>{&`L?%qN0 z%Qv$tw6*-w4n?9Hnby&)#TPK)OB$YM*b^n_${&6|M$-+qYHE^DfSJy@EmD>A$hbSa zg}CCKqVq&8)l9SU&eJc@BZth{T?$XcA>V0QRVpres zu#-jA$I5Hb0v5gB3aAJMY*u<(L>p)Zd!3D{X6z>S&)G849p}{`g&xGnOVfC9Y({G{lpGLs#a21Xgc4L=Dm 
zE{~YDs6JC@zoox^$V}|DyQkR&xVxcXn*Getp@W9i2hQmzg{(#+pI95sa&`yAnz|gK zdDQM{5|_8>(EG2vCSpmT2PHnrx<+Vpan!M+YNLo)?udMcRE`QjHK_+}ZRq(VT7iS? z$22tIAyk@nYZF;Z5!_ou>P}SAh2=MtA2ltgfw_J*GEc?3(IH+OOv54PdF*LkdmavFb$>*v)7TQU=8+xGm?yxxCnDzK*Uw_<|3GYLsTqPFEl>c5QZlXUyWeo)t;-ZmxUa z_-J?I=tC6(6-Kq^+8#Y~ytA^^CjD)0qpCyB8FElpn2&JIO|WXg?%*HQ*qf$8?eRQ~w8kEN{>^eu5<^dea8(WJ z%+>Vu8Ps3bX|m8J)9o@8hshogsE^kX_QWv4Ue!p{L|8bFLBF10sjm0PFMxl)0^V-% z18i*9He*I9ANZ`thAxW#KzJaD-g&1>G#b6!>MR3WjbnrJAnoT1k}k6>D+V~{Z4l1; zIonP&C;3RJI+soiZ8HhjEskhrlsdZ@R)_2K92c<8?`5F)citZzL1vfIAHq6Ud&1lJ zEIt^?unnt+Zrg^^eL_ymj2sIcbvW#;=}4JS9Ir>yLSRt`Df87hqS6(62Q)I3YqN@q z2&CP{rW8sZZSQbA^qGoM0J;_wc|D$BPcb*CsI94q37AiyRE}2bPbFlBs~yZL2Ko^D zk`hCB=a1Eo8cEl)pfk?PWsKU5UQJq5Z6ilX1!>}hPP&zyTZ{3>^2gST@IwPs8QgKO z51}Vjl^L|dET-n@v}1)?=bs0SHq`sCke!9?k_}U@5xx|Hf`Y0=rYtNh3G=qh8%#zV z-zFv|ii;oH^rE*GwcfsERV#kdn9`NA0NvelVwiH7N-Hfb-Q3(9$kPi9Kpr3M`1trJ zM0V^BQ_0xPiG$%bSYrl1P-5VRPXIv}-JD4zVOdy>SV*UDBKj!|r7 zc~K9Xx`MY_akymgS5_8{4icA9Y{wtVI!m?*9drrUE^yl%QIUhYC20aQT0{b;cap$#scy!x0JP}`ncx61R-zo z`h6etSV?nvWlzBQz(Eg++VuZ{9nED5n~4eU4g|;WSp0Z7WfO^laiD!ggPVt(cD>+z zQM;3a^25`udZWyHBY1uz4_Q7&5x^%$f1o}US=W@|jLeC@(B~k{%gyb#PdhkpG*l9^ zd-dw;YiFxq4iCpN-Ii~qr9Ae%iBjQniHf8??k0H&Xx0Q@om7gR>jx zZ$Z^^RomJ6M|_j|NCrjZY>7ljrp=f2s&QwHkmfMEKzrPjE}H}+`**K3kDu5kqj~$J z7i2cKOZ#Cu+ui;js;aV7nz_!)Tw2{YES$Wzcf;bF{pV#(^J$f~lo_)EIrsRsS6kW^ z${RhI$}v@Xs?A+OXX&Ljso}BCuV2r3Y;Fb4+%ZMNx-h%<0!*}=1v1L_*mqK|=8IC5 zK1XhOQ(PHyNp*7=yGh?Ik9}8<%*z$m)K%W!Kw!+WvhH$LYKSgjH|vyK z3n`Q1!UC@a3@t0BwcWXbO{mWJw?)Kyr!@v01~%#kt1HwOC`)BA8N?Q3kWNq=#Xjyw z4Z~3J!WT-MiLu6bD)hY7qswvNlbNrbf^Ai8cd`lo@f)3}%^j(7yAR0Wo44+0({TEbs6?BG73mb`-TjY9LZ<{(jo!$mSU}Pj6vf; zCVXa|jmeA5;PM`Uhphp$ZM{8_?TfQtv@EU=5xr8}k$zTt`4avBUk?}QVhD=_dpJbQ zQYE7_aR$#KBI@WF?8x50#OG+lb+R}%Ha0Xgw79tVrDm(F=4)5~eivVJ(2;4&Av=}8qxAml94egrSPo^7H*H7-VZ3k$J{2200BXsD@UVq>e<-9Anh2hz&~ zxMpW(fBLj|+?2NoiPv5kE>Xz{juaC=rxzwkSD+%b;O{y!GxOue4>!CUDmH%G&(6cy zSxriM@aQqL!=I@V`x4d|Ibe8w)nFb!`)*#|2@ZkNCRA3)9hR3bD=VuqPhVBf*V*Uk 
zJd}R^{JDsT`+#vGRbE~mm+5eF?kXgHaG@tHEj3l2hcd3C*kZcz=TDJHY0;z3yg9}T zu5Z>R>bSYNUGUZ!nVB6AxB419S0R(_!KxMSs!t-8q(lN{6IMMri}e_E5qr>hA!EY) zYET=Yh1Oq@=hwE~A0I{&%mu_4W1K++3TL;joB^X4a+yU+E->6FOhX z>k-eSB!Da@D z-h2r+xPMb=Mi=mvkq?qI34k%0lD6hbRZ zQ#q~!#1%wcT~1zamL?T3vE_L6d0FX5UwL><8%isVz4h5uLZ|%y9-x0bO4JxsWT+P( zMQIhsBpB17-`v>vb}?gpW5eJT0i)mN&qPapnSF~!Mn>~RM!o4NX=!nKCWHBmoSgEU z5m8|v&y(Wv4~B$>*0)Yz!`V&$$CXwg6D9*|D^=#?O}We!E*<*Xb|IorW#u12ihl&% zzb^y^eYimh7sq0lPMFlB4+beG+3_Zrb3Y3b;XB~e7$z!r`1HXe^V+ZcnV4{r=w9DG zqg;HjB2rR@l}=o2So2u@7Fb#E2R}dwAFQ?q{e2tX23$NV-hr>n4>a>y;p)m83SO&! zoVNP22fj^b;c>UeptVEQWggbQ{X@+t?`SEkM{|5e*S@Ny9YXj*^QRD#-TKExv-WMU z3-HM&5h!RtV4ihp7#Wvte^1*3=`B1w zrnJ-^@Q5WPC1GJ@k23FP z!t?;Y45B_yq-kpe_x0=7iBBu) zJ`oYoy?ghVHN_tLLhRP>a$m&5^ZHbL1m~523yIIPMyB-`4tT~tdl%fGEt}R8Pr$8j zYD#eVvRZL_BhRx=VheE?jGy}vsSj9IkhKaYJ;}k2{M{x%Tz`*z$+NNm_5O)YPELCI z)PA8uk|(u}IDt&NY5_YZJ)zfk&^iH(hoiOFcTHL{_hVJWv6HD^`Yl_cX; z!0>*uxC~Vp?raVs$!{Na%}-sJu#}(z@>VAcs{k2mi)vrJRC(ljSb-g)QYx^o=e)T0 zU>h9Cd2m~|B+7^m30MqFOms}${?GJwNkL^`8$yowf@^JUEiIjy?t**?W~}}t8@y+> z1;a&fU&u(SNf2Df%vV4B{<^mD%#Hx0O5l^`<9_6jvc}`~0sdf9%vSVUhdh72MoY^b zFcdHa)t~eEPM}(Por)?tFi^|Dq+U==?Bbpc$Zi~revOpynkfP=Yzw+SVc+S)L%0K} z{33A>JJBGh35M#gK?147cfLK$x6sZ`i$fmxKQ$;8AvjJ*nY*kDS|Q5405bK8}bJ*u!~X8M=NlyS&E^ zlydk(fgdDTh|QGmR|)9rKfl&zm0`mspfQ8~mVz>FdaWpGCg-%%feUk<%c5zqrV^Or z;_8*)%gCCSh`v>XS31(hX-&yci1rG-M7lJ;a3Feh9&0h?Yd;kjY|j9~$y%YvDuhGs zA$Na{mgtKY?ZEjXAt5nu$k=qyNa~Xa=mu6+-^c>kAsQ4BcsE%6Ef%cO(G*qdOn~e* zmKGN;UCb=gA5T}#i(s_posD4ve-rEdsXX?Cg9Crol0oto0sanr_9dy4<#YwFU;!*MB`L-o zeH(WB@&;q-n;Tt73!Ca|_?-M7({xk~E)uG;3-??uS&Da{jH8#ejGsuk(*u=AyWrqp zFC|(ol$hrq5k)L7gwNUIrPCROH2(oQl7{UTD;K#Am6;uSS~=~gO$i!!&JFM6@8l2M z^LoZT8Z&;TDN6pi((5`utkT>a7QvXM#XNqPsbDQTywqSIP;1^bmU}&b`bE+Ghb%Ig z1Po26$HV~DocUc`dRL4br>6$<9pc=;X_+Tws!V@F&6_%|o?Oa5+n3~F3+May!ck@{I8>1$i~d?F zmOq7#!$q*w(+?Iy%rXhfGv)#>=lM4X%lxRQi9wNE4m?kAVM;oQ6WgqgWsq&Mp2@B@d# z))J`m&}LQi%HgD*%et$dblin!oaqwvNQlO$bW;tHtbQ_H7uOA@V?+B{PT8lE@0GX( 
zSVbC18PaYj;v@Q+m5z8TEcC6;W<0?yE6H-b6DNk)g}ZzWgy&NWwqeuU=r~$!${=N> zk7I_rz97KhkbmNb`_eGIT3QL0@_HAv@t2S%k!oC=$mz8=FoZ}nTvJ%*st!%IaO#_^ zXF>RJ@M&WiA(ll#BuY|w4UhC|EzzO(wC{lJox+%o2TL?w8kn8K!SjJK@4T9$s8GR0 z>nF3Lhs$9CX1Tl}-KYo7LY%nAE^qmA?9F@LEy*m>JCqHvq76)A72_(_yN=%XCb`Uy z3c%<-5xwRAq;Y?9<(xeaTdAj$N@H@xLC`;a%!E4Ny0nj8MPo~9D$>5=#Ml>|6 z1)LyuwSySH43;8$(5dvdWczp+b$_;Vzh^CK@|d0acD-2@_F3hPLygHAytrhd2`{2x zHRZY?UZV*XycFLzizfbRSFOtXm|QlMM+l`Iqs&oh^eL? zX^_ll*(p2RibGzN$aGv8L2B?jZTf~3XyStQjcF%|gXTPT)i@ZglwFY@4+oqO&VANF^+dg;B&`v)l6nA{K@#?$P z4ucpdy^=NSS>tC!)*R$j0FitMftM!IWnI1j1G0 zFj3BQTe^{rP{q~!MF%Zw>&=3QI0Gx@KBW{}@lO{<9HZ4rrj7mPlaa$phHvVv?b{qh zh^aRKvYVzLm6jX7ylsWAR%#0{gBbNbXt`;?5t`-YV++5&H{} z{JdF0Q@rt3ruG7dS?f*++O+8&u9R3aTMYjvr+A5E<5~JdW&Z^2B5=?ORY?|!)Gg{X zJ?gT)mzqc3P>t59dD8LXwtX{+BloS-<&_Jjys|324!|2Xx4=-rVs!kojHvZmVnY1_ z!v+)l3Wu-%gF93gkh^1NX9uhYV~?eix-!(udK*YC{Wj!K!t}he#cp{#LlpCCsYP>+ zuI5cW_P?6P$%IG09~m3V*XoI6WE_CoTR$>Lj-Ajd<@X|=vtlQtI?B3w-A^7aCnu-% zC>;L+c7~jWhUp~k3GB~OwrP$h%Tx?a%I((?7|v1&Iwma?KZ zHSQ2pIXgGE-LBq@j*bqffVkN|{LM$Hh=qj(;BO|NY&76WOig@VM$6|Fd;1$GXK8{uhRG??T26FI7nN@Ydjj19R-C>)mtPE=kK&E zk5+_R8Gy=&BNzdQ-``a4YAq;RFxb)beHohKnNLC3Uk8TzUw z<_Ykb0Ya42{8ot8|Ga^p;07Yx^fqSR=9RdTFIuq|8QXbA`|8CP-rb{st&n{{b}VkK zt${Pl9o*|q4sO8TY_BXF3?NO9nm+d%-t;zJ_NC09b5Zqmjd0lB%%Ex%Y{0~ZT#*Fq zGzL1kKovfnT9XczLDH)KK_oU`J@jqRPEAR9GutUL{%;%V{Vv6Tx!3^)_&Ch8g-6Uo z3HW(@j?Ft}a=#CSylAP#b>i$o0XURz@p>w@OoIxy_#M0@@0G=FWXJ!D<{J4+bJdUI ziS&I{c6_wofPf;MiZU~$U+5j6{E`5(2I%v@FuE9KZHd9$DL6EaduPQ=)u=HA@#g-# z(4uNa9ANxvhk%mcYHCHxz(MMxtgLKYArGh$pLFVJK$W;0c5K-@nV7H~+O&M*1oiZ0 zI&fhoBqpZ1%hBbT6kY@_Bne4LZM_F#d%(*=N=9}*34-7Dr|5gS_5~{iQgzH1$kdOA zwzHG7B%Aq8LVjk)d%fYodj>YW8QdY*@tQ zgzxiAglT>;3h?8bF*|4dtK70FcX&sR$YqwuI;k~u9M~i(OW%#VEO#N6j{vuc{g0Y( z463c?4V(3nY8zc!w~@$sD_`(QkoKy&mavxPT#k=OhQB%^%kKwA7AlkI@jIjc1r6M7HgmMU_B#)&q8#*@X%Z!W+d$&hbR|wJ_qx^&M6%uAb)}b1&@dc)qCu0ePRWFK>g{iSAF zPHInuY-F~i5)%Q&`lY58rD>UR_dr2N|3mIOwfEi3*CC)%h2c@I;&w}}ZqZZW4653< zkuMq~Cm`+5Cu0-I(%Z^RiMC{4m#NUCfGgx7=pCbu`TSDcTbh0(kla%M5E7?2(Wt-y 
zhi6Dxb;=OtHBljyD7W#jzo%p972Btj7`QJ-pVPs20LIw1@+%YBoOFj-=WGP}y6wIz zEs09O7$V)5{pmudZ|yHzd_taCzZ3E&`K&$yxq6lTE{FEcD*@{9y3e2eEKl@KBSqv- z{+`*#naRlvbaZsAtft*J%z^j%A41h_yvcE>WA&_*%KGX#l&8yj6midpCV4!fd8U2$ z1{|`o5y{2jrsZB|_N5Biy80?6$xHms1%c@n|9hp!%7T)BP?HmORj0V8={%TIIb%O~ zhW{0d=OL?wS28bYd8bGmEIw-?_YBBZw?mN1xl;qBrNlZqW|RV>40Z5+NKaNKf1%Yp zsl|N}xZGc8?m~?UtNG61;o;_HNe7g-`aANS%`O?yXV0c)XZ?(XxJH4|TIZMg{a*>J z-lgV{z^mof5^8y`JU1z(7s4@o93^V7*%Gv~ey#^yk{IPqy5PE7dUx#Tn%VM}?Z?U) z^igmgaiV_m8nqhBkhqYok|e%7`&4e=ik){@Dfa%Z9?Mw6 zrR!PyiZ@p`l^3ZeG>v@79e?O(tqp1~9#b09p{L0VHb*VWaAEH!c4?@9VKoF~h&> z>7kr2y_0ij97Mttf0YNkg-D!2_KJto<2Qgzb$m_NQlA>Hc|0pES2Bh89kOe*OE-<# z${VV0`y)4oF|Fdw<7U1!ng9`e43=;N>!0hqwP5TzKDxeMG)I)WwhBo_J+&n-;Ai@+ zTKx(2jpV;*b5WyUj5)9&ULox{g+=@DF!{}!#qY+Sk{u3%Zc}M#X&amENa!f=v$kZi z0|U)(U={faboaU8h5dc~@pGEar9VHj87W-^UpzV*%_M|w10tjE%`MB_88^@C_7s)W zcaGJ~nua_+Q|nqPg(!6sy0PGDaSqt1T(%fsa)8n7`p799xEw{aHZC9V`w{F?aQ$e2 zA@tKV3z>a>39vol|3s^CjnyddIYY&)%+%D_dE5U5SoJSe(4|C2M`!gdBAt(&lBUtW zMM|K4?k^;AgxAP)(e&#?)Qw#vqGXp7s>9vazpiVdAHpn!-zCI++2a<~&nFH|J%`R? 
zA@^~#X$0Lp8RC@@J!M4=j9B~Lz)je`EW_Ek~(O}l6RP#AC0!`UZ3(+g^73py3n zOZczumZt2n`VnqG9-*6i-zk41{lFrN??77`;MIY&2)@LKYbg_agS?3%Z|&Lp#Ystc zo*6Qxwl-Qdn5i3xRNB|BU0bz8>T?Li_q}n85DoAo$c*UZs-A_3LKZ^;Sx~J zgsBJbQAA2Z(VGT2B#b~u$XZxfNbZWc@ez5;K^5~zM2G&^S+^)jdDCGB$!jqW^}%D< zCfo0tt=b-AzOMdP5*qBtxxs|0OkXAk^e=* zsc*=1+i+kiSw~vmD~2GH!?{44)l_C>3k}>wFQ}QpMsNS?D?4(WdA&&K)KJf<% z!?|1$|LD+r)Ub<$AU#i*L~*VKyS+kWzay($n?G8Lc8#Q{+;Dz|?cCp^WTA7K&v|>% zg4gfxDkk1p0MY;DCYl-cuNq5plh_&%cYqY4vJGTsYVe6wp0D^BaiAT{n2KNjcts0L zu7F3p%iJQ`bNE1Ju{rDE*H%y%*rn|DvZlL@Vw~$Gir8V_*n#cu&WHNy-DN*k2E8Gz z2jkTiD@rSwbq{Bf%bm`J5NR*ud~?yuhVQ5>Ym{=Ag8T1z_v%ZCJ?SE)z*zl-I4?A4 zhXcu}3<+LDY{v5YkM3%%sP^TwesLy4jk&W-upPU)mT1c(DU z1!S0JCfmWSikwm&v`T9S2T!n3`YT@AXRfA|GMU2`g$S!|GgIb*`%}P?#-BIrH!NTi z1l4~$(r%cYWHi=U2QWogBOuTuhqnA2EULd=>CC_VCGV!Og!MeXJ~)=cSayfLR=;$p zZ;WB&ptGHZfkA(G+7qsi~0zuSBZ~9Ofh`*$fOuV%*j|3Wo#&a6kEM z{-zic?kzVV|*(3?~zx{Fmz*;5R{R?$u8J!j~rNncUNWE$X<+C?yL{JHG|t7rOz+Ce{OMn z77Od$%~N>3s^`F0@x8spgvxa@Y~~`+pNJ1W3u-R$3;%m}ouWyV=((n>K}eQokMmsSkgnSW zdgjvWnZT*gc>3D?=M=GJo_YR1*djq`hYj?hC5jPn8kZp!;-42wXbuvmp(E~&e38ujd1vwu6_Vx8i0U<0`vfZ*Ur>Pa;+ zth|i?WXsfSR!A?Viosm%=flz=NS4#PsqP!IY?SDArX0SK?lVBwT`UY8GL`mKW!N!0 zkM*AV6i~%f_L#5il2mj{Ynb#X|>n&>r^2@ z6(x>#%qE9=0!=I|BXCPS-(`lV9GNX{U#5HaK6`q4dU=s;39FF+_4)qKj+R+hnk0UK z9jUzz`v-ll+L$NDI`!nh=z<}TZfhMyE-YvP3vfe=dPvw_`bYPra126y`BLUmthHn! 
zU8Je7tgPen=e{2c-ADO=hvKuB*ZJ!%uCC8ZTA~&HMCi>ghi?rjEfX0SjX$NYdx3#> zIB1FsKA`)dTamU$B!`D^#Ck}`%aq2s=S<*KB@Px=+u7gTPaK#N)UWm;J1uo#xwx1C zcu2W=lBxuQV|XlQcn5oXJ9=%NJXPh|jr{hFG;hH8{!RVJ^>yGJy{QNf1YV1vAX)}q zo28ovzBGdtX*XH{f_2n?;pgL249?u%=#Z&MA|=-QQ#L(}zbO}V<6WqqhUz=|RJ}8^ z+f$WY7=rY){bJ>1DLaH!3$wJWQ=|R;U%Y(zX76dW`EWxZeS@HqlG3xbCs|6<_Z zFs|R_bwECsmk(0&f`js_+IF9UpE(&}_;zM5FA*^cvy@#r`-u|kNAc-s1xNzS$@i~A zwKpSDW3Dv?0QEs+5en#r3sS2~446_W;LDdckHY5>OG`^WWF8{mvwA1MRafMvK1Zfn zNp4Qg%cL$)m@wOau6nIhZfg!~!(%R7NBPunIRyohYu7eAM5(w1nh1e}0NhM($kFRd zYnC^f{GxJ6yWX~wnLwQieIu-Tw0nH^b=zk^-tfq*cll;!P2)F zKT@qctDM>P*2(|q+D~wteoMiL#lgByc{;iPD~HCJg$7R}`I^L; z1)TGRO#7#;21$#hehS{D7#H{Zz(Z;4tsGIxJ-0^bk~oZsbZ6lX4nBTafeI%}{D1tL zzcA|8pdjGCiUEoLu8ob21Y}zd#)PB21G!8V>)}u83~e^y60qtsk**ZL!rD0<%P^)G zAGL`aoL?UoW|fTEVA2TBZR()LNLZ#p_I|43i{!qqF`jd;ol&3^78wDySuTJ=^^lc9 zo}L1V_$&~6>5QnScv^2syhUL;J?*KfYQ~peN#UZ!8Zhsr9@V=qQ(&kJO860 zK){~3R!JV$EWCgd6A`_A6yAVrY!nZE?$Mi)nu?2u*VNhyUYCwnc6|%S&CAS|d7;Q0 zZBXEX4ijKV>NIOr0(LFj%t`3Y!miEupGY6fIZ!&Cz$U?Wr4~c~%eOpKtoHkcQK^U@ zUGr=QqNEY01K=+9knGnUxA&-7i#=0_9l1)ty_TE~~HOyV5Q{IpL=+TwJYXP>kYEZQ$gY{jlqaQ=e4nerH+*+EC4fq7gsKSl0==rr1R>sh#cHzD} zM}JSu`c@LfqETiN#h6e}Ko1osqrF?vBX4rGw2>jC{pbfSd`y%vceX0K+4L%Ua*$1} zSl>A_`szU|7%&6$7=VF&C!Y!l&7X$B6HxGOfjv?1C{s1Q16axe0s_`xxarYxFcOif^BfQR`69M(u`7f%}=CPMP>eBkJ+GcHLhWWw2xX?A5)1qAuv!`W)%`=Vss8J z#7-ONN&&GscmBM|NNEA6xDqFGnzPb7^g-$aqX7Q!6xJ zU_jNMj`7=gr#&_4HOwkW`~9#&aj)y07bC&0ZN(l6(+FE%_2Rha;f$Ndo$r5sX0=aV zIkEdvG0FfmKFE_iw<#B@!4Ae}G58Xori|tFl5`1Gp->j`IlcpYf#JfxbuEzw2{p#W z#-7`cTiikq4yvG+zi}FY!g)>^Ol=UCl9IPI2F0pr@nD&AUmCs90}gn)+s? zY4b<#5;p1Vuiho#(YbGYS4+eNzFi=d5?#KOI0TSj?#IxresAQGfeV;94PrcaQdv!Gn68tff7lF|LOm z!lfe)3-`B0!AJ1^Fi=Mzg8<^RIhy!{Vf)1_Qe0pH^N3up7-3OoPhszD}Tf^;4Z z)L1asbm{Z`>PKsVSfFr4zP!`84r#^602nZouusvr*DJB_8-YF)Fk~j)d_6*f^#tn_ zY(CAIkldybvvnNlb__vy>`{<8=^7O?=S^KS{$kkA)DfiS4G2rFiz#J9{l2f&dz^!! 
z()DT~x|5z*eLz`2uvC?K+_aq&ui^t=8DJJ9+#WiMb^ot#qV?-_`1b~@%q1tU>(ai* zDUCyV*=1fAChwOB?Ld;m-KYj@df%+E&tyCPWKQ;wpJXdv2Ql~hHF1Jcf+M$UDkFUd z_KH*(QbQKDj&GFPdh`5#mLDh>xSlA(aAL?au`rl|9bR)qqTTf!CD!TZ{4M?9-V<}; zCh=I!2K>nAj4~kn#jG#m6^)Zcue2R;_YaIijzznxv(s{>8XcYQ?l#1ex$;H2nB6+y zRL;++T1i_Psv4!tVdP2m@iJQj-`Z`uIA2?ZAD)!iiQ((YtZHP+! zMQb7AZMEC!vwCGI-Yz-*W}z8QAZ_@jE6q~q5FdZuq>uq8)*orO=(=cJGN?JYYEL8`6Wp}y!A zpNgoq;0KRkYjEab=Hzj@)ETa9ufD(m(V)9 zZcM2GeCFhRM~zJ9n@bMXL)!e|veNm`RI`Vi^IL79Efg1ODw1TP&5X;x_qC;EjF`&& z$8NuR1un!5TM2gpU9UN_LV|k-%3TJJ{e~&dVLiq^1rG0@cqt^gTwYK>L6bH03Wu(ZIvGH@-G4DGmn{$zS18o+XFm5;T;Ex|~fEx(5 zUAdWr{?#7vO1^^P#tqt%j2NgNNMI8aZw1}P#$L+;3r9~+k3;E z$`JT79TOl5kWM=?^;M7SP!~gg^@h2-kI2fdD*e-i4>r)IAuy^_Q&V?zbmW`yL}nHi z+BDw?J_pX(s`g1>>wk#DZZmlHhx+MN0R%w#+3rB%zEoSX*Cl|sFntetq7Vf)t}pK$ zqL4;Li?0`x?s9OTQ($k#9SjV(OG*|WvPTyrVJMs5*!H!qas$<-zc?I@jCZz-k|h$pFZ;=@-*7} z4_)zN4@1=ZeYXFMaoT7)u+VUGYcwjyMS*_7CCv&RReKAaAn^4SV4nsE=@kjr+oqx}t1P>P%%p0&-pK7q2ZT&(-PyYvdf3}vyB;hCWaKZ5T@s%@J zYUHQ(Hx`@*Op`KwueO*68VsbuKTAmL{Hn`A-7Lc1qi2zXad3slrQO{J^t=+e6fjt* zsE}Io0Q%ot4L}4dD-QKeorhudjHEs@tK&6O=~Giv0PX+$`4fzev1bnq2uMa!_W)67 znyEd4+j4MlFq!O52%i(0gS)@bL=}vxsE~PM^-DcTi!iI$L`M;3s8HU0ec{Rcsi1cL z5B>yHwXbSa)mYR^MJ94nUP^+`jx#$<`-*#l1=N7wlq^SlV5pDtR*&L5=U#yRHNt9*6Is#Mq*ivLypV?y?0vV>#q4Wh zHGJNyv=hH*{afsbxgQ}@TA&>V(EvOg{?UOO2!H?YTZ?i^5+&T^cUA+sKE-28@G*E?R#?(L_nZe>R&TqGu?ys8zC(C+K<_d*F zgjaV8>?A6|1jiY+MfE-bLrb^!4{#Cm?(QyV3z0AqB6#}rX?>AVU#3oqHTIfW&3S`Y0Uos%9_9%j;PE^ShTN zKQK-G|Lm2+yYUAZJ%Q<41bj@t3tJD=s1j^uU2AxTdGRm35TTJy_Zl3qCxWMU#UG5G zb^~1NA?FvkY_*Loen|vq(hO**0)GJ5l?qKqR8&;NyVjIu6nphK>gwvu&CQq6!Q2x> zZ7rCM`r)fwl2#9Dbz^^Zi1vo#azUSNYs4NP62N9!lOaFe2W%MV*cQP7kJ|_WpfDf| zPL}%*R`AIm0}G;O?vq?2_4%2e&RjTpPe34AWZHinNYzzGbIb+=d#4>M9T{2Y@Upg- zhk(DHL99S)AamiTT9b5%K-79W==TIgBzR2E*D9$O$DnEym+$^b>0scl4x$VPVP3fP zxd#_C!fby|y~+lexc+nYm%k)32fGRTeJ+SW7zKmprr3!+1Cu=d%i&_0@O;mLJ)Q90 zAmt8d^|8M1k7*R!MP$a}sC$f3u z|MXJ$-M#kOYpd$fiHXHd2-@V-x0~^Md@#2*DxoSh*XYtKRYm zq1>F)yRwcMKkw0@pRT+4;P3ccuEaLW5%JK7-|VeMo0VK8D4vn`bi1*#v|y2r5K^l{ 
z;Q#ONoB(@lkyo>sRS22tPSP(k0rxThofj(Z$*n*H>Ztj1yrV=*k73hD%44$5ScM8@ z1^}UI1{FO0Tz767x%7 z&&WmKOTel}gam0G-*XaNM~76mW0PhdRBIppTHkbr4B=LwIN&!p==7J`?qSU<(wxax z-l^tX8oq+$5XM23|6#V~=}1*Y=ndndr9*`F#?dCz_c}3TQ9dY^my^wRS|0&=X8NPs z+%Zf1fV~IhmeV&R(gSl4`?5ODTAIf?E58b6S1C}a0P7wAn6_R}aD7%+qTSrH>1-$? zk_GC9fB>ayA({y>i}{ZrIx!<&p`P{Yc?cbGGG{udbXWD3@P}2-r%4CW7h>pi%FQ_( zM~AKG8uFE2i=2>kyeMfhBitSP?U+<~@QuHy7ucu!FsmssPf5wPu)Wy|AJ5NndUn4zC1p)qB7p5qONi5y3eo`L-#BbA z=;v6w)G)(&7z~qZ0JV1i;z79>DLXu8R`bnytU?pmNe=dpKyK_JN%5Bd{|=D_GeRRY zqUy?}Ks9J;BB{w=`1l+ff-3RmDNr`$azM1dzih$9K~bZBZ$w?o68eVXe^+Df6bsQ} zVq!*kgB+WBwOd*JFyZgYz2^Oy<6je$u0k`BI_>W+$GMv5UYTzP%`4H&mv94b-%y$51>wFF%c09UwFV#yzqkt ziUSk%2jtuJv!sNo&(e9(s>8~FHqsztMFS|ohRWC(d7Wnj-(^W}119qSTd}G28|JJp zU$t`j&ujtA<;6-0AY z%-=zOG!b3O4rV_ioRcN{XXdiuKind;r6Y|M&tsLbPU8(G3=E9wBimL`5I_;QZPr(* zqzwI>XVc*)C-9f41Nz&qEoai}{|7P4?WRx$NkI7k6u|Wv)An2mC+f(|Js=UJzSk8% ztye~hGJW~)Jsa9SSV=T0rCB_l!K&!)YSYhad+kgA(L|K`%xeDflywceVI!OI1IS5) z{C`qSs1p|By^X)`n=JnFkzgi;$7DEoGIA1RZ95&-*uK1vWe}@o@9Ax3H`(e0#W3Z{ zpM=>$Nd?Sdf1LyVKB^GdLDR;Lo)WF9;mM*D(84iaR zsagC-6wcsRD*j*63ByiTfF27D^v@}ZI$Q{3ln35%0_GQNDagoRDMTOh63PyMxMy&{ zfLd(4+9MHeUsRE1mtgeDZL7vmqEk<5tt-t@b7sUNj$&akK9<)?qe4Qf_?FMX)5-_q z&tlts;M!Kg?ABS->-@zw|4EW@&WYTf{NJ1i$C|HZU@|=KUB0exl)Zj#O7}AyyycFE zu9Dg>s#>9;9a~o^D)p5uUuK*!0PQs*FMqlmR}KyRI{L;S8JCs8uX{+VAHVeWDZYOR zrx!@SeZLLW9HHJ0n}$2Nlel{Cu&iTZ`R#EE~FUexP#g zOhMV~V2r5TX>C7N$-YwSM0Z$7EC%x8y&*O?HHA!19IakhnUy4rSozdwe*Ic7xv`(M zeNyr~YCESSP}|BxN4nPH(Ik1uw2QjHoBz$tbbwq&(NL8=OZXqE4ldDGhFVl`pL1E8 zK}{Ga33}Std?X7=*L*foHZqo0{ZqwbV@4HL47sBUvouGNzLIL{>Z((eO1=lN!0)19 z@cD_7l2ZQmi$ajPQv{hlM@L6coS^>ZspZ!?9i6CX3lQN&4t!!WIz7l>l$YCaJ2~}sLt5|;`xQgTDhTNUjJgDa zov+ALRC6mSi)zDpKRs*y%@r~KxWE-9llo|+0IGA1Iay$UwR0IaRbw?MJ4dSD`g#M&rFBVX-7n?MwV)pYyPi{$mKx$8hs85 zz&!7T$6^8|0~|7`Tn+43>%eB{3}o8k(sySxyQ)`+;`mP9t*&;-Pg?##V!JZC%xRN8 z{&4rzoGAAdwpa=8x4i{!UbmLCG;4*lgiNcqY?h1ImC;?$zbl7X#tAqyJ|9nmIXdjQ z!9?cMf^t7x&B(~Ec0c`QIL$!umSeVEM^gA{y@!{63i=cDPx!^1+B4ks9z*(*U*2ek 
z9c5>v@GT^Zc~7{eU@2t`mNFJHTAWrobrTi1rN~U3tm7W{&hQOkc-#xv)+-#I4v{n# zIBLJH+Y){yk#7|XsjM_OQ~Ri@S}7{HJs0j``N7cKGw>7GIQqhlm%o*lFF!H`f&xGt zmMdjMtR>O!0~-M@Kn&!DEB$-nI53OJ6wJSjNCiPR8Yn|7$X;+ivCRUC93<}qFG9Ia zx5j@43quTV%3<8sW2H=2V1`@m_qu^e(5|ujnv~tPRmeg~Lf^I5YUB>7QqoRW zkv4p`H>A#zg^PQ-m^y3xl?Ka0m3-6{Zd*4bCG$fspm|474%c`1{jlC(=WW=7Wygh$ zds&khJ8@=|WLx~q{4yXr)m*wIex?R%-{pS?lm%Rbch!;f5;$?fpM~@L`jD0_wKV_F zmM!%|y|~jyKlsN-&sD$EQ##;5!UGjcXB#CShaTK=!xXXw6SK=CmG*ZvtDZ*MJ7Y)@ ztI$xI1gAMqYQLEiCS~W;+j1LrpDUWy3wTp#yT2(pgpq>r1ieeJn7Q8VW8ml5sDtm% z;bvWS$5v5|6a^8Ip7riHG$&e1ZN)r6ppX`q52%l%&6o3YdX|zHeF}$(zvp1Ikfp|KVsB@%X8r zw~l!KY|`AEUf2L18#qjd{4Rq}zmsIgnb#{yR{u#fJ#WJ6rH^Xs=t&B0o9&Xx~Uq_952eIrcr3ELp+M-tP zPh4nqF>X!SaGnnfq&+D3qpWmmPpU=H!;rkpdg)ZsOD|329}YUzV@DDt!()Hpe&P z2JX9`sGA@B@I5i`+)WfTat^s-*4F?F`sGUN4v8Ypq+k%e6g^=&;eVUpqD7wE?Fi?{ zJWoCpLV$&l*f)q7&Hq{KbDZtygEbu30}4{7$8D&ZZ=Y0~vY4&%*)_uZ;qv*ilC)Qs z5<9O*Dec}YA=~)W_@-wz>19`PT%RTUov<6tW3?K`>M$45jIor-;gbcL&~x0XomAGImOQj;h3IV&R_LbUHOP(TIwZP0XFSv11)N!wdP zUP**r6d&V6aY9^9MJ1a1<&1LXUF`XqU0D-aV?$yBK_xO7)zxH%f&AFsJdG!}#Int& z<8tHaX5)AqE0{JiTC{zcxAQMA7Z_kH7mR?#xZrq{WPj~D+N-L+eqpw!{eJ48TW)ei zTG&gaxa&*Ya?50HR=3%20xUQw-&-=HzM(krZ34X6^a#EPrR4eH)whVa3dcFD3OwPG zyb1Nz+edhdm?fuX=}uxqFyS@)GOJI?B8%pwM^Y`yRav82)mkuiR4-C9V+DfV=SO4@ zOzke0eHPbs#bvxEe^l_Sk})vHjmtVK-U4^7^)ETsTV$!Kw#E@O$gJPAcXzyIW`AD! 
z{p%2ld{<BlsvAiU!yde^VyE zi9Ep3Gt%kX0T>W#ITyn_Iin9C*5F`=#mL9|^2qucP!mc|)^V;X)7*U{+{y>uOt*=Bn-L1i)avE- zxV+XMfW|lNesmkCbnLss##S+aE+dH!b?q zLxTbGQJ$WjN=dd{fHe;b>mC>olc9%P#l_{kj#C^ShISsVu8O+O7dbpdcv#O6GdVjr zxS1Y}B5w@kOBZ;rzZUUKg?VeqcUUb`*KB(kq3Zzsd1w9$nt6Us_Lme<(aWoZ%l!JgT+rP*uPC$Vt!q3;)B8VY}&qxhU%w)@R z<>v4ztH5$JWbDU&jHk|CWTEb8x-^|m!~GLjMhnwmq{y6Fd_fAg|99B8&1UYAn?v&Z}3nv?CE<)tN{fPQrD-WEhjH=rg7xn~033>E+y zEabOGUHo>yxH?@)Wn|1|z(7+T%7)3_JbrQudi9vah#&vNxyKzz1_O^XP%yo@QEX&r zk(TZY2-Q9H;wle0N^?LPUxBAEZ108F7iBiSW->B=|n6TU8u-SyWn zJN>Gj?t@R&TTwO@3B$)4FXlbG`L7^&9Sv z(Q&MI{jVQw%Zj8goEW7I5@ggX{M1qQZS9YD%*-i^b9{xbC@Bw90~MS81*Ek{I0s8#M#YOkGw(5;k?{egAC+8>j3q9r$DoLGD(}=^J(L~edy)h%hXr6 zUwj@Xc(U9{F9d}7cI|{L=VZ0q2L;N0lBy1&_IS?Y8Z8YHDhg(Rp{(zIjJS(@tufFr z|7)2Z?hBesHBk82Ei00YDICLiUZr;&clc@fQ2oSXryZSj>F)Tc@WZFFkEocH8!>5qq5w;fXpf2s`M0=^tWs`W}yPm42y>;GQ<&^6zcP)B&7xBr^tIQ;u)*8`w*9m^p zP9xt<$E|_I@H#p%@#PbYYL}gUE1i$zbCbJk&NVxY%Rj8z5i#tq8DaBP74xE8Zg`KY+B_{lG%X>>fU>1??!KA|pl59K8VnXf;;J%f%HjUSvL)k4bb3nU~XK zOzBP>&wC#8{3jm}gWJs&x@QZz?&L;>`cXxma}WhX<0?{*u-Be~f=QQ7rd%LHSi z?0Om^d8`ork7BiSJfn!M>Qn!?`lhSG?Gp7#4%yUqrYZ{eo(6Q9Sfpm1+GF=KxqIaK zeYR~Vv5poYbJhuR(hfJ2fSmdlxLZ=Ps?ZOvW8zjU8k>t{|Cq(wZbS!yFA?#7#0)UPJ-jQyFVWf zEb5XflybCUV`7*KQ;g=C%`q)#nLU`% z0tWIyTp=-WK~-uGU+AbJGlMfhq_?_Sl&g1Ee-Y)`W)8a?H6Nx>wrg0Vm$$c$RaG7e zbLiAgiwGQZe#cHZgkU^4`1uJZpU^a8Q8r&HN|2kRAWkX6CcVE<8%Sjf$ zh>q*YiU;@M#^NJAzu_A~^i`(91i*yT@Rm+P3=_41Z|WbFqCQ zSQsDfo<^;mHP%&4M@0@N<(B+x1&Q*BR`*~- z^3d!k83m4Vj&ffKN3qlH#;V|FtD5*7s5#Djfpi~r4vF?6HT%R4xk#Eh^O0!mSkHVm zN+2aQn1E%a$L1mpCNA`%vg_Axh=0?!G`*dC-(!$wWT^A~_Z~kcYrzI0G@I#qll$7H zw+CT%&8i|6)9Y-OL2K<8PEK<0qKra9uW zJOMRYG)(Y7#7f3DmZ?45KF8K&(`o+pw>Ag1TYcKcAnQvXwb0yRyF&8Ts1^6g8hsiG zCF$xH2s*nx-^ar%*c48;F~;j)G^ZP3u-(fJef$Zs!_PPTSqO)!2%pYc4{e-mZA6MuV>-rWjSX_YoaIKH!9S9o zcb^o#k)W7jj}s_~QO z3YIG7)dStbvQMugU40JF$EXPjPl+6p*EbwC=Oc48jju{V46UH^Cr*`?8nuMZEgEjbSSlOX1UA^1BOWL-E92tqNN`x1yC%6)*;>8L* zTA{h)YYa&&<5_pAH4YUW>KBUOMM*4|1C54O?Om=UcWkpI-fc3-p}E|SA%P&@bp0*aj~Cdk7wf& 
zt3~F5r`*Z0Pi?k`)hI9zD+Mw3iZ!e-PX>Q+oCJ`F4jK`A#5BSCJ6=-hSG^Fm+>0IZ z>ORJUvPv1jDdDGcv(yv(J97S+lMcrE#CK?HuHwsNGyCCnEb-C?*yG@r5KzW1i=Gat zX%CzI=(xS+Tk?`Y4)654`Sq;8?PC5<9+oQy>%|Y6fg=6V8h@;(QE+=L+nI~}l$&Ck z9i`CJylEggpY?HNbFMOWDq%32q$djqyCA_b^h2G*3D_qs1$6XQ4y6{Yr|l?IO8P3Q z?XVr3^>76u1WZpxOi;v~!C@eLjYPkp$}hU8;Xq8R&EAo%aYMyfmX1v1u5VEbKVf89 z&Z0HFm$9h2)cP#7W6ZR^S*6?Yx27;D#ePEu=w;=&VAits;O41s-f#kVcac-mjk*-y zFL(Uq7w=t>F|+F~>Jt1wlu|I$`q)(=1Ug~i<}~$+Zdt5L0J^CR7z5N?;f&4dwUwXu z!LzfM%#Q=wYZU0I%Sr#FIk5#J3)4#~|4D^n+b|86*8!C=w$0N-X$8kG=%rjPZ{b|0 zey->BVDTz#WL+Gr($yJUDSLqnlza~FPg7xEgiBe{4~Oh+3OqTxY_oWE+{8M+3=cML zE(edNT1*9tX14EA$LR+89p;w6yE1yjUbd0U9_W+fPN)CfU5BXbwEY|zIg`uBq&96dBi#9);)NAbgl3Z7fVQM>x;k1S9XIsw(_jJqEN_t|a1smzNqbvOk!S z)BZLPVpQ*YFfT96%?;h0YYW9XKHN3x&nlFam*40V!8bH8U{=j<$h)N8taC3@y(AmB z+5)E{0RaJC-oXRj<@PJ*7F&|79qy+iKEdtEqXHg|KRP}6BZb~3b$gnw?d1~>fER9L zeea2$%)JOkMY)84*+gY}q3^-(TL}<_gqBbHtZPK-9<3OPeM>k%nC4LBWC?36j}Ffl zr+i}$&LLE%qSDici_w#1-8fOqo4S2CIGLxhCHRJ=I&4&u%J&^%MPrI~@TBCkZqcyR z8-ywm_5AS;8$*jsw(v5#XFf1r1k7|#&j{KM9Hfww(u%bfCa&z`vr|G$`Q z-?`}skflTkvu{W#r~o3-F}(6oDBp_(WkBTg?WwJYQA7&AqfWR0C z2QJ4Tj|QWwpe=*z+`Ej9p3j$Q4H^glOIU~N;q}ipW+S!Z1e`a zTaZx03Cj~t(jBj6TOceYA3@xh-Q1=E?RUtHBz;$OLJJs7Wi{{Q4dI z;o@JAa?tIIG0>bHwz#xbN$@}@B5)6H zhjJ_9s{Fm;cW)L}Zr;PQs3g>$B=yI0NeR?6S7>wKzAf5OCQMbMr7ANi7D1;=UA#_> z`yp0zVdl2I&0T8cQ?6gAu;ytb;IBBsqbU{$V%at^Qj%!x2Dh$xWus>@%+m7bk0_Io zMG$X&`TF&xkk)TBQKJ(z!efqyS=K&_D5KWI7ekB=i*vfYwrQr<$XY-7P}$NYBsbw7qTfYa;(p zxL8pBR1dLs_Hp{Bb_fDs=+ht=4JPPOoS<^u@%Whqk>)z(&k`AfS3KN-v*@e=MvLT2 z{01kt`-siHtRgrAPmL3vfp`B9Z)X=9LFez7Cq#%hNV_bfX^uin{bY38!x&||k!rd$ zO3yGZ$(K?2>#*}X!o#b_pAQOL_t806q4b9}3zE6{_kR^@XW1`Wm9gLb>hyw!wx*E| z4TTT{EyM=SFNJzY-SrVzZZqE*GcwF%;zms1?abfCl7(h%@vyKas=$Z~n{xNo3=E^+xRv-ln2E&T;A4A=GA za<)V}xR+3C2&6zPb~wvOCGEUoloi;&qUJe2<0jw%s?zN(OCYWY<=e5SXxl zO|Wx>3i)lvSn;sQFaedgoVgdMD{Ms*i55qwgSlg>X?f#DUaY9w0`RQY$3gQxF}W zpSNVL)nQOyzI?fIe=Ztm%|8S+Kd7;_wqBcSi`LiI-`9jkuteX}1%8!e0}i%)kOPPf zjM|G+2s92Q({>9Pt+pB$8j5e+JS@F*MmwS1co=FEbrxFJ 
z^aD?bF*~u0{Pd%Rb-08))jqh}ffJU(t3X(D*UGgW*6py+u8oZVb73yYo)DrVX+1p+ zOUv>$_v3}Z*Ldam0&S)jC<-y@vMF*qu)`Kd=i@!9Gc(_RpxEXmbg2VOhD5f2Oac$Sy4G^IF1inYz1u{>@jSb3~y5<5a%)@ z6=}8@MB?DOF$T5&$OnC|Z=kiJ-V*U+<(kTp*|jrH0dL_JQ!&XbG34`ub}5Vi6gW}? zO9#Cd!U1dmUV>ZV(x}EIy|kQ1=a@F(&=0>C3;S1%*{O8ZSI(Fje{*~3hJA$cKdDl=qXT-=h06MM%Uv6t^J&@t%7eEICM5*a^ z)^^%ixPCq^YtkvZZnR@pD=iMex$H?K_=ddto6({GT5h^63eGX8#to*CQm4=GJ;*_T zTDC$-kCJ~5w>Dq5Q@(;VXlF$-L0pq00U{&v-=F*h31Pn@#wXdv)j9Mxs;Gp#NKqs& z%y;S!;7{+fALQOsp`EMDCcKm1=$5v)MZo%~@)+%yD74DWB_QeL-fGI%g4zD+qc5TZ z#x)pUF@Mb^C(p@u5a?GOp{hkktGx^Fm)I@riFVe`a|dQy=aHVE^&4swrP_R?oclZEIelQ#t(tB+b9=;k z&DzG|ow6~f%$Wo3!?Yh3()DEgRx!eJuDcCdN%~hpbl0mGtNXw2$euQ#hdzm!lQde| z9oG;pO#3W4_kM6O6HobR7H-jFHerf^z_*OdVTj6gNo8B{pTwEMfOqR>38*g|Xh~YRim|_VDmX_^yv~GZY zbysw*Tz#chw}PJi@UxQ^Y-*B0MtTd7CZ(7WNG-PLst&Emjo!W3v0dSxb_3Uyj0F^% zxW(5e$kyj>!9r;50jszyJo;^0VqBZ~bLx$CmXW#6r&J&5#*_`CNQagIH}CjN}I0&3WQeS^MQQ^f1>*yIPq`0w%moGxG|VUcQK zQT>Rl%4Pp26;od3B#nSYcyu%cp1D~zU&rtyxn0AYH^#Oszn(}Bsj7(k{?^U=4HkIR zguw?idtdKqg;A$8(zs6(QS`YUu?=bU^~>+X7bkrAxEMj*dI?;gC`zwTQb~STPRJ)X zy~3%xu#tdTdQ`$s(T_(em}fxrTH}NmA(h>W7pH>}bLpG05EW@_GZB$|oWe;LKSuPa zCf{zoYWt#vSiIqR-ABM%hv^|o57tX=Yfin!up4Amwl8B+L@yWOU6|nT{a9A0fcOTj z$~YU6R5PLD%V@tUY0{2-e%%hIeo7=0~HTNT>bokPZKs1 zttGyd*bohmpI;zo878P zTy`V-oF6_NBYl($-}Acep%MEmd;DL?OUz~Yom7f55GP9NIGzbHFCesR@m*@?Mn{rM zzlR1z1w%vmb7+a0>sv6LPten0%ecT^S=Z?fMzLD*$q!9(K_TjUp z_ujg{IAVn){NFe3H9p_P#7#2()Ys$_@rioDk7<2=X^q$pwDelR$*F#nxy)K`J&@`W zbX_#uwi56z)dqHdTpjPR2EW%L@0lK z^e$Q>ngI5NkKTjeb+mV{HA=5LE1Cjc+@yeU;-#QK+k+cLpd zjZvA;#0?{LPr7*4_&*_kH$Ytpk0wAT(2o&$vw*Wr!)Zhtxww0N<1JwJ)(N!RwSs;m z9qq{-O(UEm-E^pTG5-Cz2j}eUE2(~L_X7r48@B4|g4u|sfqP*J_Q5Dr*C^#^|N6&V z4FQauHM`&^2a)r2cE3hVzw)o29OQO{2ilI|OkEflk=-%pC==y?%%uQ~((C_yT+zQX z9^&h7ymwBw@YyaO>;jhk71o7o!}pBp?KB zgdU-RcYgp5(9-z>ztWC^TvlW=Qk3vKr|#7eLc#EJ!~u12dGe6qk8QU84~PU+9Xt); zHE?VI(B>N{cMo~raeE!2J@aq|<#EmC#MNq z0x1yutkSt{Kevt>X+!y0l+&%;tk!n$7w#|M?c^7gEw(mC;RPqM*Vt{P6!g2>zKR>p 
zCB`5GbfxBybG1<}Vq%`oJPjXMSEDS|Yj{`ym;u(uX!hah7?^SI+|Cn#%47vDzE5*d z@W8n>3oVVZ4P-=8yAJAA_t+nZO}EIql(1a8MKcIbN#QYVVtdKA82W#UpNdzaKFc(h zS7=p_b3bU}Un`~zAK$+27HKeW4j-X={x0LEPdaq<-%yJlnLyI!=H$jIcCW{AA|RxPjc|&Rg}_-wNe$gp_z-%x{V}m{i+;g`Fzhg%%r0X!vG)?(?^9E^64qqv@y8m>wfuE-0qEf-6KKYv0uILoO&HvjBE3QlNAdLI(M>(VFR$$rS+k& z>C2fT>^&tNv6n*kJ{^QPG@b6B0N$1*kOkytU4s`WH`vgQMYpFxxAV;>uUL8Lb>HVr z=pFl)Qx-`HqPpw^L#}~To6_WYc-*Die*#5+lN_{Uu7fn}OVHba2%U`h{8Eq(0?lFA z+rin!LjTv<4l+H$8*lD$N`Ge+I`xyUKbUx3Q|*ZM=|Uk_PoGD*cLibG$}>0Y5?=cd zj5)tur0>dlJP)9yC%vAALM$!HpJ)H5f6JW5aKFHz!Pzn93T>g_x2PZb{Y;efWL6$7 z#;3mMvQb=$;&->ME|BNhGKZT5u$|^kYmCS5)SLtZB>e`;hR~IKwo5TFtV;$4xX0KL zv|w(2_VJe1F{IX{7mv_1^iPO4ck2w|J$4P4@;1K09SM_1q} z82&Vtc6;xgdhqMAaC2IMyQgwr=D2O3BgBlM;%&k)q%Pa5AX|psf#!W9iGFHPxr8J( zC*5W$hXmGq+~aOhUOL#x-UF>=L+_5ev{j zwZ0G5B=H5hoKw(w;#M^_qKPzMN|%hyTwqYaL372w6g~fb8R!@YZVd_)aF$XoL~Wv+ zFwsyJ6|Vt=U-y>ZYxm1?I6?H2_qjPgT=uP5xn^l$EM{QZKvG&X>F>!rZLjttGMluD zuI2c-)q3ttCRJDS(+&C-_N#3qu}R&~Li_xV`p^u3Js`b*w4^Gm0kRF=0)0 zr8W_(1qPA=Uu?W~H1<93~D z)FWX^q1@()%-eEFt$?;^~c{`dwXr_PI5V z7J*b>Wg@G}DWeGPzhP@3?sggjv0%@v=-m1fJ@E$i%MNxP5wrUXr<<>HH&ikPHUO#W z{b>sB;E`i;Bw8OhbPJ(129i)IRcarcn;$&Io0Xhv)*)Al;8xT`$d4#w@(kH#IfLxD zIrVkqC2D+m-A0W~czrFDkrDaC>0Wr^0Pdb_PhK>4*54nmJVFC6#N80iN<52qv<&ok zlaotN00?f}99^@f%v&#mZRKQb7gaSKy+ieIGIJvfAD|W&dO-QiOuuSI6Z6Q;zO|-X ze+@F`S%Ue?#HZmF{1WK8Nz&l#JOL#Z^`*?Ahxv9bAlcq(0}f5zljs~5p97zq>-tqT zN>Xl-Ah`>$faCX(B9Bo;-r#T66l~huq#DH75Q&%(x`cTP8Sgy4Os~ZPYE#KTa3c!P1^-eF z3qGA=>g$o2dslnH48RTqjQn!;&Ckp;-q9$fHEU-rzzR)11J{?4!1ipNocwoT~H)YinK2Tnd(hkxcMGOEMOLSpvz!q$b*V;G4vA(Wv zY@a8BLwgA3#fk;ZY#RfTroNi{&z@lvDc$UvrZ|k-Sl9iXUTI$34`Jhrw*<;;xi?9X zL=uUtM}GUB@c4@%$WFfK!wxOsb?p5}B%bh}?$;oyn^zp+YmD_kKVvqgX`m}IG% zD{4pEl*ip$OU>tKh8Pk$lCP&8GSdq3v{R9uCsiQhEpYr4X~Ie|^+ce>=4a^X3R)-m zg=0PeF=9hVSE}=Yh2S08Kc3Sn6{nogv(e4o1r$1OVJYS;+J9}ay zXdqzV1GGyLX=YKCP(C}@RBFza^*ipq!Pt8;g1O`up6E6HHzgiA@oCsseqK^Z2hD9u zQ7axhU8@9lr0LTbKa60ETG9&?)H&rhfn|2A#i?2B;@DUIqR8r&GRv zgPi99uPPWLjc&+nW8>Sfuh0GSV9L6a^I2Jj*?U*fMGQ@sK9$)~OG*UT?e8L6$i8-p 
zR04u8M^)Dj zl!RPI?sa^DHyCyNOUVBAprw5QV^vR253ndrNJuzD)BxpMM0j}KhEcpbDG7;Tm1^^j zQTT@&&I^h2{0etn0G2CvQSu9fFj5=^9|hIPswzV{?=InKd1{t9t>@3M_7V5$j^@RO zgwQ>HjLLrllL-{H;(|61dFLXQOJmeWe@2;9iGXsfo*p_0o~0y{Q3)Q`~D1!nofs=-THCZQ8nE~1rf&L4}2iwy-z}b)Rgu3)hisaE85dRmh?k-6*ZO6Ui0{ve@skpK>_sl$UQaH5=1WC3IC0jrW%_9h7IxW zS3q!bgL9gUy{b8(*vNfop@+`Aa2&1sjpJsQyj|M7U!E51QdXN3OH(Vj^wpNyD%tuz zoky;!S3oqFSe&-eu$_b5IGy;vg010``-crxwVw#f2`T{}gTftJW+r^-KLH`r<)AR? zh|^S20j-Wfm!d}!jpkg-_T}ekUhg+=uH&HZSb`o|mX?+;U(R(UOLljG1KV;*&+_6yT6A#G;&gqa*kafd4I}l3n7DXLYb#-JK<}5GM#}u#pXa}a z(DjKp#VkO-e>WeFi2Fcn#PKO9DLx}RThQ6oNK7nY0AcXbQnlRP-DbUlI9vk5!!rWi%lD^32(JX zUqFWOd>QqMa%Lh~o$z3E$LT3didWYwY#06#WR*r#*RR^Y!juJcblq@gW^h^OLFwg9yPx8=xp%aRTMm{}-}Pph9FqkZ|$ zY9onsPD&<98JM_`*p2q- z)9)Jb@1sL@{>ABcys~pZL=%7OK2H!$Mtst3+kxp&jEc>P?b$5F0D9$R_imO{pp;KN zpj=Gsl+5%+rt_wXgYL@s(Y?;R>G(`}$9w@vTTa`69I=4lgJaY0$ zrE5}pdbEcv%bhHso1Fj~NKEFo80cvyh4+0&C@3oW3kHQB@vp=~-Q3(BJ@TA2Zq$VO z6&&Gdic21Drz@sKIDAI{1JRGqi@sPjI804bGv6L~AFcl3Q@M}k7b%s{d!USUAG{=W z6>_&^fj0GFKtVFtC9bAd>WlUBZg|R7owthOF5$UwGWu@?C@2Zw=?(F_^Eu4q+jg9g1Nfb z%?kq#yEekm?|cN6D9)R67DI*p;SPM7CW~#Sj^u$&3kwSY0a$tRc^{ZHx>ZcFM8fq) z5q_eNK`S1aSiasikVqZ%JC;@(%6S`5eVd8@Rvwpvb_yl%E$f(XbM*?nU61$<5zPW+E15o4Dl*z3|{26@Noq{7I9Hk&zy29d)~2=P^8h zt-07h)G|asNxM@M{U37{i}!3EkUhhT_h&^vQ(^FAu(1f6ybnpV1u`2@S9QnMH{n?J zTQK_yXq7k=)_5%Kn>Dx$<$JfQTJ7fS;^JaslV|wM=zjYKcqDl_w&be2B}&6pe7RKC zkND9=lBIXet*ucM3o0eIEWogcc~?5h-HxX)jK*J$=aKEK)l$)1rdLA;(ChN=#4gBQ zWeRus9|!%%gvgPh?^!;CgHr>4z2V`=p7rl|w))$$O7-jqkn(9h=9z%@8Co5zOe9uO zS6WzU@9;&EM0p6fDX0xL!G0|*FXTK&evjk^xi| z+~IVPdeU%BqFFymJ8JZBt2@}o7yoMo*v=Z2Ja=v;bbumyZu^B=aAERo@dM4+;vzP^ zsBs;l6bJxjZL8Dgs+BEQxkW!4241A#4Phy8h)ihZ_s|2kut@rcb6?ZS!kc{$Ax!#n zuBFTl?;1!kArP-Ix5amRI+?{9kMTRctUTVn!pn7QId7J-V1qIbNS`or`w=&o1&<0+ z;_{YBgQ{RgOYCfNcGa(~z_ z1^$V^sWig}YHv>V27m<_~(YEFOvDTaF%3SQG$rmlQ&#HBUt6pD5EpeAyOU!Uw8sjtUNp$*a)=e; zT)hCm0DXt#0U5dpj3BE-yIvFBN!u=HSEl_o`wOo#iKqFAxb6ut+ce#XyA_48y=FQU 
zs0@O*DzwL{iMpr0E3`H6jSm^sGAxM;G{eFVlpjwO)RqI&1r~r`fDfWd3rT2nbcIDdjOGRW6C4-X04j~*tf$+LHe$lk-(9rA~OE}$r`nJpQ7x6IAnFt>* z6z3h@{O8EbX5cLs_B208AC{6~)$&}eIhzjvV;vpsW$vM_2iBZGX8$b4cpnIEv^Hj* zfT+R={m)q$DdUJ`v0^@rj`Vni7!YO1_vxpRd@Lb$<6C_g6ZYr&0+%2^T1b7pMMKm4 z!OxoFzWyDGO?IrK$*gq^UOf-`{c{a?D3oRLu&lv12MVrc_1yg9C&u`0I%dkn0@a{Z zzD;>zM<|LLOQnEdi`+0HObunt4$_b z`=Z@^Qp$7uRcfa-HfNegT!w>h#tDEcjm3Tm<5;}mEGeZr(U4%!jI}-ds}ByjURBU_ z>--{IK!43Z#-)p9=Rit)zjTkz_UF>>m%09D+r+6Ihz5ckqJG5Q4_C=SB zi%NbKnaKi`)TtK8r64eIG<*!p92rWhN2YYCQWPzB2Lrr!MXBw#75BwZ8I(=87?m z>5t=yv8u)D^fuZ~UAfp-_G4d>=eW2qKP2z5^aupXq1G$@5tlB|e4P8SmfymdBp*BU z1b7|cF}N}OhJS9YcOM(t@)@W%Eb71h3-pPTKP*jH%;X#jcxqx;0|ZwyGX^$CK|Hd4 zS1m;e(?u9bEk;U1M#pC7Lkq@3bU1o(Zt&MN74igaJ1CpGIRx$xPoYvmU-}<@3-i$g z2ADR}O@n(mJxFBl3hpWlvBFH73qeRN!rIA^Dn+#QJ5ll;1c|D|@|!~Y#6h=FVP^<@ z?F}&64QpvQp;9#oJVN%y7?@URshOUgC84I)mX+1vRimV%1Iv=?UFx4#d?RG_p*F%T zS8yA8TFv(>Ir7CE?;6-li4OpoFo3;dl!j(^TqK9xC+~b#!(btC30>jOOdMXadPKe0 z$ZPo{ynjF(2xOnR(5g5fCQnc)X-0q*=w@_BINW6k&haNIkXEc#3Rc zdz%Ng#WWDC+hi)c4FEi!gw@xV zjEy-Cdbm&nI$VDBzdhun+9rvKNLFU%c)l4a>jmn2GO2x%5)!^}^>-Vr`S6YPb?@!F znY-Gth2%6eqlvAFha%v0F5i?3@}369B+kSVeP zIxK$Un8+ByK2tF=k*2h&@Hj!B4#vmQ(qI5b=n_5A(j3n9^HaRoFm(egGyzy>!oMBm zfd^1}KG3m`zXyf?P?-B9V%ia12=F6N3|{7M18V1j0{soI^!ndQN@iCDTieH$i`34+ zyoR#1ZW+7K^Lu*)sR~CVE=-CZ<%>YT97gXgZS@;kigQWzJACuyQHlK&ukBc5Oblir zSEKF-C`vC2^_CFK?L!~Ife7Ei0>yEA|5FAgvVKI#J=S(}b3rto4}1RMCrl1HxANw* z3%k%KHz1RSiWG*4N5aB4zH9w#M`b`n&EV^$?;2RHu+vhp>kWGQebkQ=vUu?!hglnFY=KOB^~Spz6MP z<0)bC=@^gp75DuG>AE_CK3%Z=IP_w!TZ3;!x%DMrL!v<`4Q6+M0X2fx34YJwW@3#D|%4E1Qz#kaS& zD~cW!BqUVlHbPfnF6A4=)@YM^{od!c^mmucZi=qw1n`ZdKhU5+=@(aFWCWJX>vOJi zGM!k~ci*AdVwyErnA|)W*)^{1(tlz4)?e_FhYznaBf$SCnhqgQM-wXf(5O-&IkSBaff89kMnhH2*%~`>0#1J^a|-d{Vc4$juNE+sm!>Ol z%BiLrm+0;RdM>AU-UO6@tm-LBsOqcW0;8m>&Hqc|;)AR6_s~7n_I_K01E+W} z5(aiCE~Ods#x`Sbc?-tMeNqB$BeJ^M8*fQjNAzp6FbKP(0KX=Vt55IsONe|h;2vz0 ze{aI6sh`n@eMn?On_7pM+brGaQAx?KO*{;=?vaH!N}vtbV?Ixc<9}RQ1nJUSQkwmS 
ziDlmHH`3HQ-@!B?@Nz54*%|}jxW&ai&_eaRY$9Md@cr=&1d6e}=atE+Eg%UGFh!!urSaPUW^}=j#>E=(!$s{+MBn;Bd3Y z%)`E*KOHd@1@;-DHmbi-=|kDkM-z>9$apsalej+E(|D`zL+4wbhs?vS*e=AO+z-Xt zcU^53hy(Af)Lcanq=9LLPIclhKY2pK%uIjzG^yQB zdz|E5#7+aS)!~dB04CxRK$JAiNA-e3z<4#fCkhIkBZXF)#I_r-OE2FBAU{%;?|2p4 z05(BiN=3fi=v~J5#ogF9Z6^$7k5(27B37!Nf1B#+D4uZ!y9W&J#DE)o+<0XKwGxV( zkx+HVkp6Ef3(9N z8YhI$6*(!X#tN4WUYn8nRxnlqOgoz3p$-63Cazw+`t+mG0=&dGw^z$24|Gm8nSVWv z`46d$PIG4VNIF>D+U)4)NnV#zX^auwqYQr6IJ*#sShRrq$|00%4J4x2%&v-)xLkdN z`zC2Ns->!hbV6$|xzZDDe`^eTuKh)SpAk-Lg7bluDo8z9=h7`#TupGpK>n%rj|26) zB}#k-#B}NNIpt|c@W+p^=d*H?O;pZ+(O+))fk>?zxH1VZ8OxQF^<9{anh|wfq@q~H z%fFCzU1#zqat0Re>0`;prto`GtFRRyx;$Mx^1tMin%4z5Ue2`hE0PlMU;A?b|3W#r zD|4ScS^=w;!_B0sSNPRNlX(sH;bqhh$BcrT9;oO0V|!67oC5Ula|f|)-(MGRc*8y$ z03(7SAFaqDWw+$b=EtF>15OC(!{uSb1*$_Ntmc@_fO9B58Sj@gsmRd+EK}w`mx*LA zjAg^##%4KA*j2-@J3&k^mgB~a2NmTHWAvjcKSo3}O@JXyGyg)Y^4|Unv8q`Qa)R&2 z_RV!lD@NIJ4|cw=UKbXH%2U+gpVUkB7)1~zS~r1%=PfB!yPj#)7UaN%`6P)bhM0(V4Rv6 zjC`tWqi4{>uGMV`8d|Dm-j!XkAgT^`@hquxmyHDBm@JLdyM0o76$%AdU@t0rZ_!+! 
z()il4=!sD&yUUx?$Neu9tim-9(1$nM;SsEZL8J6R%a4B**Xf)5>Q*7YSevB$&p1>@ z)d7a^@nhYS6Q>_XnrgPWxjc!w=QZh&6YgctManAqXLS_fVO@w3Fw?H1YHT~v*DKk_ zhEEMNJ-gcfR_0hAP=MhYMj+E2z&Zq@TUz#B9J_sKJx@fGmUaA%9UV{p8r?$(lmlhz zygBh_^VrhK{u!E0*dbpq!`J182Z~{%YV0I&Rc4Cr|FbgLdd;pM7Z!Jn25CRjEzy~U zY_&olG$5e;koEK*UjvxModJ2?Updz%fXiLBG3vy`qNx#}kUKt{le={966&tQbSQ)L z+pAZ`gkD8`zL2i}*#0$IveRLk zb#n9bXMT`(LXUKuoR(WcVd4+YHXv@@ecslayyWkXPF8~2*9Yi%JRK|1WMNS>Ght7% z6cfYAXs7;c36;7)meKS$X{6WmabVoBUjKzdWXk<;lfr-AzM6Zd9{vCNVK$KZsY*!* zCH4hyu#Z1bXTA&hUn+(m_DdNngbXrg*7mEN!29|EL9v4$B)9foDhhBS{VrFo1}%8} zXZD~8^rwprxVia$?YTk~Gefsu|2Xn)wHpF{R zV+Up&kp@-tNLmEFg({#|G(amy%^dydNs%+z!NLL=&fMQbc;K(Zbz>#pw|hu~#`({PyXj5!V7eUCeu5N#=UaBy-m=7bfb#3xLL3PMl3*DQx3}JbeJBLd0$X5t zYBe1`PhmIGK?EdOqos}t#iO`+zo~DKLSXZp{dXz%ZZAxB{_|%cVJkJzP#B@CGAJQ& zA<%92FW#84*`?-@%)Fi5ccf z!2MHkkXch~&Q+yV&dgST0>bZ@zhjrXP#4fyfJeIuTgu~ecV6wDAe|^G=5&p1^J^ES z6S@TKbXCj+Z(BG->-&aK>?)_~{0WF2Nz;bjQOg zT|-X9O?q?OTTvAL+Y!Fm=kV}#PLE=DE^>~Oz!>(@U)jGAr{-WAd1Z-13uVK*U*7(8&jMb5pwibRCknVV`vTFfdmSWYGAHanF)ARki*eLqhySfPy z-e3mlJV6YwadKT&76M4riq6<9oOaa6#7-ZSTk)E!4xozQQ$M271-XgaeH6kM-NTl( z3cEiJBJiShkMgzOgi_Oh7hwWZX#ppuk3u|t%%|AO3}(%~e}Ccp`AGnhPfkv%rdR@$ zoC<(QCRSjuj5Tlm=mNX5e~p2RoE#e~D=RR%9ApGfsxbp5+*kr|)jq?));4Q7zF7~o z(Md3>k1qN01LIP>7kin27zK0MuWi64VFRdP`wA4Nn5)o1DYg!NvQs@bTPCP>JoBvt zT+7zjQ;g)V#FMv%oIUJs`t#H63MSrgQ3vFkb#bt_RC>F{%5bW(bk(HaikLvnizWuS_kM}Y$M-v#a#GBb{EBO)lQfZ6rIoCEy{vd)la939Vu%(j23pV}3RVz_x+m1}gttP5|4 z^gX3W2FQAx-RI)-o;ISbjDBu(AlL#EqY7H8>YCz5b9Ad&v@CIclRAODOLiA=xC=xz z4AMvmNy(Y%>GpV0k6)eesy9>X^k!#fdLUmw@#aJK64=$7@2rdm2M4pNWi)qo&U&Wg z516x(r}J73WY|^h-4qbeq(w16uH<=>7|gau0mLuXWO3mlz$cxCKRRPr4v!uY-x6J4 z1$>Qu*Pcpe_!!7jBX(a5(sW}M{cl}u)jGm*3f=X(`aUH(vgwlMWO>K~no}se^*iF0 zL1TIx9kCLA;b?A9_x_nPD~qxcVygN{;KEP2XxAgSH^2(w8K4B+?7v-~&6h!kQ46)L z88ffX`}f^+wx{!}BO{v;>w=E{Lq)(1;G^8l?VaX_R?~1W@T#7!>ESUH)VuEiQk|70 z;`~jp)`yl5%C0{tiUw~$#^ESH}wwlpc7%HyPcxXbQBVA zAStc0#;)4Ayp%Px>pNLjKkMt>-pe3uBY=bgGz?TkKHyfM{#9pqy;A9Y@^?n%*Uz?e 
zb;S!hS^6vlxD3#!rFgfds}-^nGa-DGIaOkiVFQ>gm=G+`$U!=&7@c@GGeuNHAV**H zx$by0i5&qRzMHm=!9mna(M}<@4>wShPt^7)rdD6{a(A}fKzxzyNt=kQMBo-UOfopd%r{eya^<|S-+Hc?49Or%T>XkSUVEo-z+ z`%Y3nv;JPAtOKa;f+SjLb_$G(JN5jlO%U^P6ohIE`~45@J){k+Zx1s&wfqADmX&6d z5b;3i0lWi!Nj3Q>FJ!ofwtR?;e1Wax+qp;r75v80!(aybeTLTF^Pwk>BiuK+!Ha-m z-dgC!`0Gl80}8g?#R z7HbS#DvcVbrHxv@q`x^ru>j+Kmup->|vSZ zZ-Pn&#qaQX%78{ZV)CF2GF#{q4%veC#crHAz%xw4=XakwGl3&uIM2i_Tg@l)e6cEW z+Adr#JN}+UNQ?208nMHDfOj%ZIo9pXL#Cb`pRy98X-;M6h#plHZFRn<=8=`B(6~8i zvY{ve3H24aiS6Q#N3fIaQM(4^kSE`ps&hT6fmTg56<28k~f9?6v8usj3F`F;d8Cll|f^-)MgX z?nbAmuUw*~wP%KAexot>`5SG+npXExflbD+4l@}0(LZ$RR`FShDn!>WgL;M&9*Dk< z{1Ef_M z%$#p?yaxVkPBG$OSDQxE^_^u3Q>DrW_3g>vt3MiFKL>k}o1aOS=eK${_|__HyoMW< z2QC47HGM5aZ$hT$zi+O@I|TsTT3vm1+it96Z6P%t`SUd70M9+hvK?S&a3wVu04Yb$ zO)}rSDa6x|kK&g6dG!C#7_Kyd-r9%euPvtM^SSHol2p=O;-cT^+GxoK+(#&bJ_(J> z=v&W*O>V2btP=%v)1rP_Njs2X_VRX4OklbDz`))%3?*)(fkHM4qA;9^YYvgU3SL z-kZFK8Uo3V$5ivgK-hk+7(^A#aJ5*LP?7I{udkzXawlv1Fs zAbqzyNJBgSeeo|4{oa}SSJJLvIFO;KU2MCx$t;9VMU^NV_(Df#3?%Li+R4f)D*5^O zZ@^^`ybC~2N^0uc+c0QR3aIzg5wW}iPW_=jwRh+C2BQ~c(Y{MUlpZ@^StU!8t&~NL zDJ(9BGq$HbLe<$Se%#{-VJ!#lApr~uH4GbbOa^%W+-cIpX|5jR$NS|zGi?#6si}{8 zX)ZIufd)d~b-56Lt-s`zk~ypY!huqBP(};3^@zDai%=A^pmPQ$15hYOJ!OWX@d)(g zji(k?VY&#PwpbE8R81$FIxY(L1rgmgZEc*CkTC6c!hc@GNOI5`I_|@LBfmD!r0u3$ z}zHyt1OD6U-T-qSR;nJ;iqlaA3BU}7?vX$?ao4wiw!&1G?>Q|q!|FwakH(E>YB z=|w^X1=_9LDDLu|o>VLi zE!{)7Fr^2)_lNZUxy@sg`wXrRzG&FtGatk)j>pCH?$Nu1>}A6A%^oXx=MceK&KoF_ zWzdJl4PVyZX~9v8bi>AC*`p^HCeKL!aSa)J)))-s8cCbYJkB+}eMP%`Ha8`ghmrxlrN+x|&jb)-7yf z6)6b=s$H$5U>4QLfXfx7(C|%#$9hzIr)S$I?-M~a=WWzBhhG%8lsWt!EN^F{Jlv#% z8hjR@4-M^{d#^}=mQ-pBnVfWU1kS*?K~FXD{y2iUUwhIE0zSuX!BMa%?`XY^Q~u`V zz_1)LKy1|X)&YDq3P>}wd5x86Np#;)l5gwECT_U`-siv7TaR(V=geA2nhudp?TzWW zE1`KADi&#rH1lTvD*Xt2RoLR)mcl^WFz8*$q50=tY`Rpbr~gZ{M5h7_2`^DmwcrwH z;B9kIk<-1b2CPl^l1AGAP%}1gcWYzFy=sCYBQ@028r5olMAxs+w0Z3J#Vb>5HIcUi zJ;d-APMedqDo}l$3dLJz(`#daP#k1;@P;DE`s1(varVLYb6c=%;7tttSjuy9g1Fz-H#OOfm-YXta?j$g 
zl=;QLm{1z>0F8O`Ij14&a)tBeU<~lQ&;p=dp#0BXwblj(MihCEOS+fdL(CO%k)GQj z{ouW*lpsH_=;1L*Wwn40Y?gj2y>jUo0O=;|@Ie>7mzn{@SI#c5Lu5N(LZU=HjzP6- zJ%*S8KFOyltTQ+);bElzOurdS4|mtVOxFV1e-i z?RS+G%-}8ksVlN;_}UT8vjlvq!B92wCaf>+oBAAb=asP*Tg5U$I;?;a@#gO#n=4`Hn+PnipB-o8*Bs(yiyLwxj8n3w1#t|DicmJIn z6~DUD=Jz)O7zif8#WDF8%~B$FJu~#LI_i;>6n>Y#Tq=J)N?J{e0m#*#Fm3aztz&() zftT1*n=Gxu)~vwUSLjkZOeE*FErtHr&cf@a_>Wj36hli&1&3=NwpagTwCS27RV_HG zfnXAn1AGYS$6j6`1N$cTk9|X{d!tE;N|+8~qB zlhHsf1N|DqafE!f^qzf$%rxRh?jTmqeYOG6gy?HWqV^$EK-{a7Ke^0N=&y^I1LEGD z2E|Z*20*v+{-t@s`H#!?x2nmnSZro?wlhXRW!L^K^btzjHLnD{ki+A11Od+V4b;X) zIY&uA_uQQ0Tj<}TdU;RjEbA$HvtJ3BR!35b$W>5S13_&#oxq5R^w~qmJnzaHj9{aU zwUw30=L_x|aB>iCGuP6|XuK8<$T7Qgx)BKe8Af&gQ%=eLWKQz<-K2iNnYVy1riMia z2dZx55S%TgG~Cny!1XW-`2VSNd^6KKTm;I%pTuuIE7o>g&wl%mz%CQ`B>Me4WD1s9 z|5;a6all^#d78e*_d8&BOP*h1=e)^|U^H!|T!MpaPhQgs$tqHN{^(6d`VL6CX(@Z* zhYWBLE>3c@v{PRHzFgao!)kf{S!B2J%|C8oO24Ya_NKzZLcp;ao*G>)ID&Qu!nKv# z_6U=0Tc8PaW#!=sL-lJ~0qPj~m9`Zd8XU}JSbJ{Guy^`BNUpEBOp|S|Ck2W;wOQw| zuGa(JZ>(|XZ0-1brU*T$3&aOqCmJF^8(3b^^Xtz}cI?t`cgC5`fZK$LST9F%g>V&? 
zFv>p*=0vhKwRS?@QC{R-=iOe`4m1po*XIIjx+mmjLMEj%i{8f%+{by;C@Eke`X6>;O=#{9u)TaV7j|fHI8H-E?z->#1T>kv`m0S> zWAc~6=fn(a{r%I|^+wU?SynIHNX~?tUgT=EvuRP0>EsC_$)(JMrnZrUH~Uk*q$wWg zU&ZNo=pUh`9U(!$aF>_N+*PGrr_n*D{)(9!ZifLSe*IwH$!jpOG=B$D*M6sN47Rg# zv^)8Hwo_m(&ebeyCk`940h`htr>*DB(Y{U8jU|Lu1TjMPp@yD_CN`~1US{UBerwDj z8?j_YhWtQ=cRL3XVh1emwKPe6iguyexue*jO3sP{2Ir~!U^e%?EWcsrlKFELn=0E1 z2QRJ@QX|n(bUBvytKGlVe$E~{5L-utpp@0+ABhEjZjZ!}G#JBLZmK(;DCPGKqEnOA zVh7Zu2(oL|V4e6!kh7SR<8K?<(P~lkOuY?+&W|K<_MwSh6e0CuC+Ct zfmr*G`zTzgn=XTg-D$x$#iurs1_uXu5z4s(edPp8@Vq`hUe3|{t?$Z|UvV97vD`J4 z8oQj4@Y>oz5*~8%>I2?yAKtahhHR{WJakZXGf*z2*~j;m)|>IglQ2B zs<=Z?PqG`!X~{cdy&scgp$h$R_@QzqPV-T^$q{7eMpOfpp`$*d72l8*B%#gAv* zSo$-%H^%c_2mN-!%gO|NZ48Q4uYlhYd~sctb`w^6?nptq+p-X($N&c|!mqb5tH#pX zFbWC^-n!M)pLw75UD6@M+U2^1Lv7i@p%Ad#_gmR&w89*+G~<#z_v=q)IS*@*iu7)v znl%~*$P!U&-b8cuw~^nQw_=m|vU)>Ino(Jyil)jCtC^TPQT> zoD2Q+K&C8P$Wz=Q_N#dFaBqm7%q4;&z@=iQ)Oygrjld*Zs^u{g0zK70y zIS{#=Piye6n><}#23jg#-seTI_=O?N6bD|*k};T9Lx))Qp{XDi5`{iSyU#amFs`-a z%{7X=K>aU|KSCj5Q9=cIpMAlW=?22>#b#POYXJB>M;|J$(~^o-akIc!o_c4aNIy_t z-d0lmQBSv?gVDXJ6_NTx0=U8zq&_SWBqgev0!!VIR?Iowa5DNq#TLHC`W)n z(-<$WT92JG!xPVrQlyJHvK6mk7fo+uz8Y74{0bpG|NRZ4Jvdb^Z^F)sWe?8H%oLVE zB5DkL-@RMg*#X0LimM#7k2)6%idBj#vT5(#cR5M;VfU~({19cP_2mU-8)n9-Ln~}@ zQk6wIYMebjYsFcZ|NRdB$_!Cda@rK z7dOsEQJ}P8VPkSl6ff)j1SA0+%>Qn5y=I}HKEy&1Y$=gHcNZ_+qHgR8Bn)PM@I>o)qEF2V6zVNLqzltmJ&nx`EhYEp~opaCd=EfK>1jw%@>-- z=(~zBUo|dWTUvFedhhgls#Y4G6kR;!K6KWHoR5zP`7PRgGW6}o$qOj2625sjBSe>c zoGos=Jj5pyM?{T;SW@TDqi(gwvXF#?gpQ6*OiU~%88rCbxzjy5`q7jgyMJ&XPMsGl zG;Jp%Ba>c8bpb!LMZ&L6`?IfJcHsN>-^}k3kkT1HchD$xFmZ5jP*(0`2o&PB{Kjnl zil>0JkAA?7}j7jGI-2M8Wb&n8=TUq zsRUY+x=hfTrJQcP&+dZLst9a4+0lcJMn%#{cdPXz(nI&$Cma4`gPwoH0Nb5|FKKBq zUw|)n8R)T&iJ=8E%#^=?Mn)v9z>nqS%)zCWazKYz7779bYJiJ^Mp?k*F^Yb9Jc*+FGwIi&mQLA0Qm*cu;G=PJx&JQrrz5RaP3H^MYaFGwxHeGpCTVkoa6 zqJ3z+<1)Srq*Ozt$%B}p_Y(0Rky3XyKOWGHW_CoWBU|);Z8{UTNMF{<-Hbr59C6|y_y8K|fTeGRD zLT?`(@7$f$(8S$`7N0wUJY@yNEWCA+z zRYsySe&g(`;9wroVqp9o@cuochXoyTNObhWsT-~?E>r 
z(B^Uj573dH@YQ)0hHo_fO@NsGCNgxO>#WCTm`5-q-98Q-JRXb3yR^`Pay1?*ZM_|$ z_M$Z)<>e+UGFXKs;E%6P=B0T%exqaxnyH6L$J1@qeIOBNsdI)sJIqGl!BMrHs zjAF1tKAo?&)OWM|fY}TN!f`Oqy|J;emXa#$;S((_A`tL(^F2vUf>5jHy9ggk>YO{K zR(I1K*mj{xj~qFdr{HO0!)p_b&1TlVGuF~_w8zf3_MpS#PH5Rr5n~LKfm~h3T$5jJ zH3H;`8k{2_oOAzv@#~kN^GH`n{&4};FTrUa>CBr4`Zob75I$#JZ7uNsC18M`A@pE4 zz!JjTP`J`|@zDiQS*ZtgFkDzdk0QQ}0^r(<9FrXN*7Xh~G4yKgvR@MQ@TN@FUGutN(_z zA5DNFwe8zXgqi)W>2YIP{X3&nA?wm_lbT4=5WB(7=HUyC78U^NL&D6u=$*lsf!)56-zE>0#ksm_XJQRwU zulNDzwXamm)GBoRi9n1ic}9@QJ?e$;3E{7{tMhc!=+w0bq>ICN_hYw*2iMkB>U@UD z@h<}RcU+u8+)CqSKg!0kvWVkxLOJQ z7e+|>d)f7mQ4|Eb8e1wjH!l8TAN@+_2dG_ZJG=6f>RU&QJ0AL?_+O5Sp{CM)UPp%L z!we%Pv)RDWph!V4(V4}9qIE6tzH`dX=SaXL$^|QG8rusb>q%72;fs%)Vw6AMZWecz zK88Rra|`21$v1Er*PySp&?+Ff^?=sy%0sqCv$ zti<3QkbmW3n=p{Re~q;9@mHvRBx-^!2&C{#nB0wBz)8nHH&&c{hfFW!|5iuH6*%QB z{mOCdgiAI@k4$a_>Lr9RKnRW?o+UVc9OZrF-Pl!d?)wrF>_hKxx0KKNNCn)ZqK>~- zSyffE7V1|RHzBNn3XN4Oa2bxvB#5Y0f(F54qqy$=h~v-9d=JsPOeTtOd3nz`l(j7M z-j|DcQBW=k8QGzzh&7Um^u>#OH!;^YVV$`D?dJ=$Y@R)PW@>7xqZ5-p`t6Rgiy@BD#1Z*NOq*e$j3jw7w}a)CO&Ux;;xlNVFnsUn=zlCAwxqq z`kpJRli)l&hGZXl&K~!VP>?2ig%%ruoaVmZ2XbcKd~P|vl(Mk_hN-cZ4bo)4t}@(s zg=A5SNMXb<(imPMH;Yt!gdli(7!+-o5Om=PxdA>Fz3qRl2_V=1R*4uLLD1tlh&k{J z=&KRpA7}?g?jIX|6;te_)BK#Z7MX7QPb3pX*bxRD96357~?+ z+ws;!U8eq`_2DpMn5lr7U$kxbzdiRWaO5n;;2j2VR^BmNufL_m?vEfKBir7C`o65Y zCVv`D!pS)kFXRV=WG!tJ*yM^H`t4iUXW;`ghPK04cmE~@7mWe7x9i+hiOJ!q(g*to z_g|PU!U>;C-^@?edeDEJEGz^yqKl^I}tjoX-TkLK#tFp%k8DtBHM1kJUd z&#$emfhh{s=?0t!+smPCGpBUdRG_GB`aHl-tlNi*ZtQhz$!4>^3oOhd+=og?fS9tz zJ3`(siPY=q&(_A89gcVm7Js$V+ne`H-$wGE`^QxZ#<)`nIDD2$di_NaG<$$C4W|Q4 zRaLKiu2~3*04X&1-H+TR4}r}LP!wDtl0+a71_s|3`_u1obEhR(eBq7slgyKpmX7DQ zzrn$Qgre#oSI+Kkx`32(0;=ncRk$>*@9ht zYnI(9r-OtB*m(_M~Y%}^>(-Xwpk69JC?criMeZs*-QuSDZ&+LDczz{rR zlES+NPpX@nqkTrFXJ@6Pq(V^lNUzAs%O~3hS5{O2cBJ&0W*W(%0+`RrYR*=VW?^Oq znl&BpkyR9hB~OJ_>^dbuqU}v*xA$Tr)Q*{0wU4&qphnUnmhLQ@Fkj#MAhY+=IP9Du z?HaJMTxdh?IAa7$%;KYPvCHGh?B!KzJr!N7}#eb!?g<3$RV?FHA74YK#8W}1H1S2COD{$*pw){41J!3`D 
z!LWJqC!dF(o?FcDrDu`qZQY68#=9KBKj1Y^<}z&aOmUBp zbKs(8-=#=uQ$s@KI?~3nIv&1GJtlVVJ@d-OU8OLCB17u~S*?L@Opo>UKO(I^q*qkb z9f+45;I8AG7vFUi+FI`^B7JAM(3gu3f}=5X>yDCt!RIDi2f=&z3KUs^skmDfzJOyR}Bz2gJyElw3><@Zua@@>hZi}2$zBY3T!=xBj8Io=WE#tqB{>}FlR zLx&4U5OHvJf+G~WXf9KO@qH|Fd$F5guA(v%*T_VxPL3u38n?k+b)BS*L*8=kE(=s* z63#FFsQFPJ>cZ#IcBGrdZq06A=X1F2xFK<#;47)JN!3nj5=4dxrQeUc$)cOf+Q6}= zG^IcM3`HMnR_s?hi}{O@#GO*koZWXi>p6OlU8iFwpKV&T8Zc}T-($Bj9J|c&fV;q6 z<^yJ>o=H22y5{_(wXx{aZobW+Nukhl&7kna;i+ugzG?P0(>wM-|6;0m!*EC2;IJKW z2kxZ~%=Mg1U|r$D(D3I+JsnK0RX}JQfLT$ViMij#NXqt1Pg|Q9K@cu_OZy5@1E7i* zblCF?TUNf(b_zfG2`JQ<-|MXrvrY%IXfxH)Tgs=IhR4Bf7Q3$bZhR?e6aeZH5<`5V zOByXvdsDoODV^UHO9eACD+Zotj9hsvPIWbA8|i%4dS z87b$J?!h(w7Fo%|o_LUVv5Qh`)XLhf!CPc!d*!Ta>0_5z2v(m2I404WT3-BUDn0Nj#5@U|nA&_Z0_Pgno}-Yoi`T;`AGf#9M@!)p zd2cx3<_9g0(G1uN@!XL&wchc>Lz*R&&l_(*u!_8%RK|hLm|d1_MxBwYTRwdU9&U*) zcc|N@g7xO%pD+BzO+*4b2OkfwFghSM-XzgCtgt2e3G&U8uC14qK#sH3_DSV53(=&V zKh0^MaE(GJVqZldS|=>7ah?;aTZ8rCs*wl$Oo#>%EU

7} zeY(Z&#q5@J=gxVz^=FD8<{GoVy)B@Il=g?;=gm*D(?~;GSV#pTgy~}2Uot5KF~AZWno@jw9FL(#jxxgPZHtRMg|y@NO){Ov zTV_wDS2at;QMY0gqiA7UumLwg)th-f)$t`}dzU|>i>n@cjhMB#{BeF;5FhTg0)7MH z8QJg}LGWH~J!>!VF@yZu#yRQi>};FE&rdT`;iLmt|@?5)QW`DL>hfc<8tsYC~CgLo#fx*Pn97}YEiLvXcr|$oTPGpx7f661 z&mX=heccUJ7tChBI%Eh9dd+@T?0Na#PR^ZU&+s4T)J>(c?^DE%TnxHN9$uc8!o!@$7M*;X6mY zag!rC$tri+6FlcIOUy#>Csu4D1~w1@ftjgkV?#q}8GATr+6tGDmY$oQE-Aa}t6~2E z{hFPVQC>lVFE*aw5UJBsIq@PO_7R>7sM-pW@8p6yI3y?3+FMp5#!!TP54!sB^DGbS zDX8~KFwMgmeCusIb6+SQ&>~llATJ08OrE8Xt>G4~PVQczwu*wjuhSGJ`u6I5RH~7{ zv|kuIjQ7rp2O8JuD~ypC4lwzW9+``eH4Lsuk4CH9(*@NXkq0t_awDF9qKuLr0=LC0F6u2zz8r zk=GxPxpH;D5%NM4(RG{ZTrdLERs*&1pm!SU*9M+mU;uxeRE6ZF!>q4G?)8fj!w*ro z!xO{BksRHSMuM2&^sX$|8({V0dGT&~7SfmvhC7A8w;{`eA@paw&cu53c4Z&!6R`!< zJNH;L#i^PzS4c1j_UYCU27}a?k#pgaf|HNc{3|jmt=gid4D@TUYvW(EAC&=`5=N>BjCYSx$+u~$usyz+) z2~?W}-HCeOK*+o;M6_45I&~js`oJYyks=Q&wbY$`8IVKkP9-zcb?cS&q4JfFhY*++f#*{#|g*x{Ean9npr zXWd;D#Evb0V9NDE_n+OKJZJWr{ix4Yt?FZ`5S>C%J7r znKW6iRkZHilC-b!wGM&igE` z;{0@g)>d{_t|tU^95c^NOdegu(YEx7zn(yJh-6O`8D>V*W5AVnB51 zM9Jgm-J_fda@Z#Vdp7Xqca%J#4rjgO`Dr*kUTlvnybUiK<4&8qF%bT5< zSXmzIIQUYfpesZCCnWR=iZC;3EXsyJxhR)Dc;Pc1a+v}OCzH+G2LPn|?)zv5$2rcI ztS^NH-G*2>I~mR|?Yx0|)Z2o@@s3m@#yonN#f}_DxR=<>wbBadG(&c3V;DEi&Od|y1Pl-`U0W}BHzVz}I?bTkzVb_IO<`1w= z@m+w|{XbpU*+)g_Zw#xcx)4)()smn6oQS#Ga|ut;8tZ3qsTz3`{ekwT$;ad zpsCxoPtZc~1CC}?oBMYnO2W##`axl8OAj}On`ox{N0)Q`v4QHO)BAbv-Fxxk1u!eE z2?zaD)z$ay?ChdnR}j~k$^1X?p=YOcla;?He*yjYk2+~k zUI+sF>wren1_1d3YQ!CC5!Bd8;KPXdtH0_eccT@G^08o^Y`zn#Bby}#!SZW=J|r-o z1MV9@`2XQyvO+2ViP~CPxT554hqbn~EdhsLu;d2XHy9ZDEuFNqMu1KlyobZ{=duW8 z+@{)9D~3^@3l>^o41f}3VDMseULJ7*-E1-6;W34kYI>;Z-yK<6iZ2?$v}CxkVb#G^~GlTGhaUcN~c zvbFsdJ*c0X_aXBq{GmyF$!U(WmmLu96)xkJRu9+!oHs(a1ta{)c1l$ID|lDzuZ!`u zU?NBmEk5z{0^@fj-Xf&a{>6A^ncv~go1NqZkQe|Br#U-tB>^g9p<~R+$(ht6bL^k< z7vs4tN<_Q{|Biwx_eb+sB@o_|O6}7WkZfDo(*%-)!x#E%Aj%^+^$e1Kypz4j|8=Q%IhSbESvf$@NriwNnxV zc*+<4&sM*SeN0i1IrmfG1Hro@XK86^ZH)m`u#8r+at~!Aa7W*q?-7he77Dnb#CZbV 
zjI@8HZA19jyc<5!N&&HJ+(FF*4udKvRy%5V-~BIA_}MfWpeQmqj^lE%rAwG+ev}e8kt2BlLdIFjwlK zfO(!;eoy=7wwIgpk9!&3uxDzq|Mk2ahBBd#EG8;k?l=&qUTL$72$FZpgdEenWZ`9I3YxUcEXyAzRvKI47YH17vsC^s{Ph|0&@W7 z0(n~9gMA-=a`v;D4CUpj0=K%kca=iYGBe*ix}a8cL%X>Ubap!knzdTn%W z6_+L;u%BSENjT}x$qmn}soDj)3Eln~);b?y0~HqSBgo8)Jr*>=Jwub%KOYDwAK|P+ zZCflnW!|`6mRYtbENE_FY9@9%lv^PTJb z+m~C{nrqH6=eWlh_qYcU(TDebLuiKGnKGmQsmdZDVH_VHbNqM^E({&EN}0{oI2UU# zJb5yXW_|N2+3m~}f7a>*Dwu!eY>9(EBj|XwHF}#FeOs`ldDPNuVc*|mle2 z$hO{+u~6IdvM)&2-{LeeFQhpvphTNq1paspzLI2=(+me(Z1@|>k1^y6+Q*T!Y0O^} zMRGEtVPHJ=>;`?%!nf4>7Ma&*_=Xv@Eo{h3G@)-*xJa(u$G>`e?2;^~!s-SHV9ozSnv~#2Sv5tZ%Nb)=>r`XcbC7 z^hx>w1aSJenHz5(pn_d!eFCU)3(>f^xrHw*F+RD&4T+LI8q>Be;WcvhGw)Tz>6310J{!)xA%h_aX{1d1BfcXhz~n3QmNyR&Hc#gMWh z4LiG8v!6xjtFerbWujjH>+1#!ner(xCEXo7mw=Z_oOS@ou)mhMvJ7{bOT@OEJ*xQR z_ZgP7!&SZ7sBWQhW0}39cNA5JDnP!z4AGs8c4Gn<{k*0PWJVHbb$uR-qdki;zd4KW zTy%zGquRo)Vp)%CKKU&Ly6JkVYQl*keQH7hm?6CA9qJ9=ZylvWK}H@l~e{HMHJHb zTSNhBrdMBkejh|)rJi|$fjUUU*rI3^;Q1wfkL5hwyR{SOKtYd&a<^I3`v(|X@a_V` zJ)KpX`0CZWY2SHTtbH|8(#2i{ZWNFjCX;O5Yyb9ZcJR^3teDdw3ij7p^{>I7loRi=i0-puWTm^XD zOgW@%85IRP<_U|*Ev?=-g_ovk-E0dR(b9OJ&>`^>|G^W?>mEAb>$Q#`s*%Evc5ihN zLJaCbk93)yMX)AZ-1Qk*t{Ilvw;nuKLBFWCbBt4V`{HES*z?=la%A~JHftD<2zjjFQ-_u6jYD8+~7)_ zk=8dz=O$z+qd#Tr^wbA4sXjg zOZaMk2(&E9-gej+aNP?%!efJHW^E60-7g$Bmj-inRLp0mNiZlhNtB}s8Z)xFw7xop zGBc^__1zM*O{Tl5Z$8)Tc+8gv=)1XwaG%`bNLfOuI=OAR2RT%i$jQjc^5Y+=x@4UC0f?&_hZdIQy@oZNT*dM=Nx%$!M}26$y6Vj6#K&bP!d1ti;t(XAyD zO|{+^jrYV`F;+Q#mB;*S>;PN?#lTc|A(3pt*{+;oSqphgur+hje#bFwTJ{E8c-$lf|klj`g zbcm*0hEy+FqjU`1Prh)5Hn?vrr)O}_aIaiHC4W_Id%TgT?abE+arZx742XYP%gmAV z75d6PA_Fq>Bkb}KZ$ZMwT`{3^akM(iI;kO$^cuF_(HkNNe+)YgY9N>GttsD+e=&|+ zJ?vJCO%i`E8z;&%_QQo zUx_Fw8KwweJ11?T@;TGYCN95V@;04>h3;IPmG37tyC1n*vCN_hSJkaj*IVf{V34|} zEY_3e3R7~b8mkq8{8a#jz%6Bh?6&i@{l23@|zqS2TIe`^g>Le6a=48|9Lc4h{|`5pLTy zli%vUeTS-e4C&7KZu|NGMI7F9%G6(t92tNYl(8+0$X!O3{JdVX)`<~P8M)|8amW@p z3~aRkjEnS0c3mO-vTY}^GH}{O;jpksotEDG^r^SJq@)u46~*tC0b*>{&e>UfXr2Bmefj 
zm*DzY*;6JFx;}np)J9W$0<=4VX%Te?Z(EQ-4NOX4j!t;mW@AaAwUq6?Y@UY3G%4EM z_V0~e>tB5M6NbMhY150ZVT<6>*zwFwt=;~fW{m3u%@CcSPO#yNLnm&+~F*2g&X~25b9YNB*6GZtt?Jy0VlJDX#e<}kmq{Ldlrs$oDru(QL zny|>oFot7ZLWH{Apa_`5mnQn=4rq}tp5St{g!Clr-R{|7!T1g0-rZ6m?bQiidcSO4 z{y2xXRS+5xuN$g+>brCK%=%eT(}zo)&lihx?+~;YjsBCf(~}6nAU_7AP08^isI}men9NNiLbZRN`?rwf> z@VxNw^wj()HeNx4zI%A+6O;QU$f$O3g2RA{!9rHPPb<~}P<0~X zwSn%W9*eI(^O$MZ$+o*mS7b@4F>7R4n1oDbHuK5JiSKUD2&^Nnt+USvev!)A({>?n z++4On^b}X}Bt9q}Vd_0PNwX#r@d%np3H>hWScuqd>|5AjldKGim>hoO#gkjA3^`we z_FkqKmo0{ziX}|oJnKh5^VxfaB^=+xYfT{zI1Wu4@i9da!!DpUm$1v7K_-u|mMTIm zg1N3nPOI&e3q@c3RJUbgEpao7*(^+deLK8zFz0z>^jK8YF7`0R7)kvz;{G!Sh6If! zs((0WrP|xnx`2(Y<9YOAUsbv?R6;hUeRkf!$t!xR&^0x$E<$1>h?x6(3YprpqNE?W z_oXdp1WgI$!Oof|keWO^bWr5iDsTdV+`tcSv27Rw#}C@CP!Labzi-_z4wNlhl+)#u zk$SCWMtOI3sXgUirvBhh9Ox(!5)*bPB!%>4@{8Dcucg=i~LEOkM8SOd}&|GsO+vqAO)}4h%KXxQE-eMyv9TmK-X+FSz(UP zK-lLs$hb?_uKt)cZ$RwF z&(FW69_OG5`U`R~9}#xc4#Yn)>v+B&3cXZ9xST@Eh%sghTiC^;6IHLKzL6>BJ+o1Z zs@x9%2@Zd0=*?9iZ|b60X;ns(BGE(|3CCP}xEd1X)Dk>1(`Y*nP7(HFPtbz*ce9JF zefrF+6KDaR1ay&pZG48!!3(d(V-7MK4bEdc_(<2?+{SNKWSSUonbsoW872KYRT-}4 z!Nbob%hrC!rW_T$#oW`zDDjum_|`lsKo;ptI`Dx6;ySqt<0Fdb*itQxzV)n? z{Cs3cqUYnImb0YB0m{A{a#UKi;{OCpAXr`EwF7yrIW|fEje@F~v=VFaC6F~E{SIvZ z>*$;G!5qaWX=!O7P052fJOX6!w8ko9Q}hT-neVP_x%JFwpi~dH&1(i$n7S zA3yeciORBYD>{MtN?9#86!lYGH-^I7QExq;JDpO6BGn{4W6vv_5%KIEefMVmLkJwr zvT)H7oJ*XHg5@`r*^!zggkQsNW~GT-GcEv^<7ayPhQrqAl?`Hi~DNKh<2vGiD`@C_rJdy8vfq5q3`(6`#}O{V^g@o>e%uoi_XUpDSFDvmIT*4T(z-XR_^UmoierzV2jir*IBJ(?n*xxwV2{XOy9OG5#zsIt+FU>f ztiz}4MW)W@Yj zrxG-QgK}I%gB=%ZiSv?h9Oa{H!s9wy41eEY3%8k5a$Astp`j&hHtIde7zLa2--Mq# zrG8U{YjH!S3n`AEO3OzMfK&wJrv*D}4y? 
zv}LM&)~(pI3c>Y?M|&mF-C9uQ*r3+2d+sHrNWgB8%d-lad#6B98c={T2`k4Hf9~Y$ zaX$x{h1mj=WJSR?4ONUw)4mEi9=mch_~n*BfgMx9<+z`&9^!qe*}{f<%5TvN(a`gr z<|26yI@D7;?Spe}i%J6QSMTo=vl-nFgaOh8B*^W3Gic^5!=&uhW}mC?ty&9s;j zizDF{AG#1h(CnqGJT913^&-_IcElC1GPXnpLt;G+_px=3B+MsSU#fX2XbF^jNBpmv z;S&c3*}wDYQPI)gYyYQ=dV<;&#;f;J!KqkIr-|6jW{T zwS55#AJT@|cf6V-u-JlL3~2i3n-3p8^de1^B7x*!Pw4sjY1J0ra^CO^7*r%^!t{_F zGA7aA%iJ+jA4B8TDe&N$;9!d!C&^9ULjF9y08bnw`ojSA^6p*#t}cgauNrq?_HnU6 zPZ{1Y)p>r6_r~mJtp|Kpc=fusL1w&P$I;+j>+S#EFQBLhh9hwj&k!uhzj;G1B__P= zxEc4&mjsHC8uL0K*mr7c7s3MR+2MH%%r1!VhZ+sE6Jzw9Z=qz`XmNYxIjJ(cc^Cy% zfA9&{b3yjZ(b8X4$-LQnn7Tc)v9Sc4MqL`+nV&vk+#JV0UqL*TBVwhp`S}geWD7Mc zbvBINc=7&c{BO8?sKhcI+fDx`KZX?V|JFIAKF06+sbN&v(l};5;wL8$AQ*-SUGqpR zyyc>rz;;CjaD*VxQDv2Z$E}V5RGY568VtL~5`KO=z%LkN`u&?4_#6z~P9waoE_N;! zHuEhwg$_fOe-|$=KyRMjkZD(xH1zG;th*~Kz3zPU^drA0h!~`*35)>gzi})*P)HI7 za7hkO1icENy&u6jY+$H~HIi!Dwd3FY`BEU~q`qvkqY2;{sD3<855vGn6AsmaME9mmYyju> zX*DP)zN~a~R~cX~ZEwxaLj9Qs;I}OZTe zVl1Ys&vJ7=Uec*+ZLjg!4*)e8}_Qpi3x zi%oG=6)lvNlrQJ|_cmNSE8C-*$ors*(}MnA3@6eu(+VwfwaeG5CYn$5jVw!zAB_wx ztXVc;K*a#y=&I5sf9m_Wr|=ZwAn_|(4(eBu#I`A5M2##fYaTG1W|_<(*kZ`A@PN&w zyl~^;+qbtzI`aQ-soI3Lx3f}muRQjbc=zo_a{NzSv5-N_m&o+3|Amm2!RXbl23nDw zV5qR`fS)`Au(x586;FA=Y3s2BkzILMT6#WfrJb2c$_KK--8z*P2TdQeU*}+SB=+=V zR96#%-J6?RJ3S3JJX{WyOAY<2MW`h?`L3nSQLdx*|I(50CubNC0LU*O*B{wsMn76S zG-NRm9#vs7|J@J^Oka8PreUI-&OhCZ&q0w6G{pC9fve`_A4i1B{#|mp-pw^%gO&M- zX^xt^+x2Aykz9@Z30?4l?SudMP!cO|bw~frxdPyV!uS`{N=k}(z&CFK$O+)yuNl<9 zJooEG7_=YRSn%<|EyaHP+Scz3@YYH0-^Y+kJZE!xq@Mih1kU99AKI<{BEM`ppYHe_ zLBJ>D;(KQgM%w42FRADH^(LGdT5NHkq*+wWmI%`7lmFLB=dhz?-4~A5fp7)h(hZ=^ z-32p6p2CZUaM1ugma|jYUoAoin~jK1iM#=KP&qg0U_V^!w`PDuo`5Gfe6A^ZQBmpV z&u2;uAd^Qm|7`?x_KRcN?`-Kp8$v7UHtQ2#{iIqF0zKq+SXo(t0J?+Y(*#im2M2li z;5_Yj0S}Se)R9tg$jPxuNp&>O>_Ng&c^!G)<#aDfYBC@|3KR(x=|R+~uOInx#^C1K z8X04Y7@xa2fA(V_nw-2&3({Dst+#QXM|j=k^#!@T{b$=62e_cYzYLR3#d*7J;2AN6 z=Xrf*{WoVv$IqcRpjaq^)d>h6;PE|8Hyau%RZ*@a4mzSC#p8NRYr-g7`RS8%S1%(H zyuS9m8Q+{^`Qo^q9lXDqWdvOv9qE}v!!Z{Z7oxw~i>M{wK`uDR`n~6PuBbX~?GpK# 
zIG7jD5&&8FU+rWWJqnKK&AOh%L+!I}nbyph>QyLbnBce&!x%u*NH+mmmfF#Mp%5M2 z<@L#7JCSZq^NMU+c{K=p!T-E&boMK>&5KWmD6h89ecm?A6k1{LlMleE1>*NHukBVb zr1aLFmb@KY`+@#Hwhl$|+K7^u>8Z9+bk$=gHUtCdyQYTk*jarJdh+OSdgu<`P0%-sEOD3c%!)yO2{tTJ&YKna^ zTHwO{&n8VLN$9w4ep;uOx;*G-#vmUVAMF*N;_GlVwr$0O`F?(KWJ^pa$^UQy8QHVc!s^F{{nM8hpdW1UnEz}QN+xuJBZXV& zHLqlBWF#KzQB-ta_u#e^8u`dcvNvcQ13Li{jrvb{giyg&NIA;1{z?TWqS8SRm%0*t z)i(;ieWMKEAsEpPtK#sxbnKiN1n1IkJyU<gnJoAMCQro*GEmbJ)`6&!#IT2&X2B(t^Y-@sF%>n|n4tamQI%C2sxQRq`$ zI*XOzP`@fQKZIEIZ@g;NY?YAIe*yF+a}9t%O~ymzV~_fEa=9{1e16V1mJYG-&j&sZ z*z*DR^W1H4GK_+?h7=4qzz0sW0>(F zCp-WbXUd2>Z+p#bZRhyZMZla&^zGY2c}fnK zCuiCoTjhDg3%J+ELp{0FWZ)5)Cixc>xV>bQC)k?n<{Ge@<0RN< z+20`Rm}dEzD#@vJnhYN-LNO#dQLEElO%vat_$rqviTfvv;Xe3*p24^k%`w1+3)j7gC_&$IgZoO7- z#lL0h!|5Sm9lqsUuu_laQ_?H3Fg*G_)t4_)#9W3E+cfc_l;fs?hc_^+b&$+a=4E7KG0SZWP8N}r_Yzwq4*J$KP&9BQDH?kj-RoVen|--wGD!G52hy<6%-WZ zoVNV<5pe{ji%oV%TMD|^JVhJ5T@&%;3obslSe^hW2?;4Vd00{dNhuw!-7h_%>j27% zZ!X7M8O6n{8D6o;&mKr;&W2HYOSCT1{MUlz6gsbuRfiqo>mIQx@Gl9rk2(#&Vwd&K zCWR~}L>`0)v6#Z_@MYKFT~(3o(a>zDvdn{P&$mtlyCEECgE#UgymK1JFi=$ss8rV z57whUk;1E@RQN+VMzExdNy!JZg$gKZo7)>JC@6;zW8Q;^`MrnhG`2UMpS&|0My}C; z!30ERU(c#-*L2fhJ9tC;XJcW1t*R1Ka*C~9-4VhBBeX@WIL69W?(X~e2GKE0r>Wu7 z(qx1NvE2L2-qT(>J1?%{bZqwu$N<4H!#`KDt3G?3TlkJ971)4|n= zij>jgc@H`#Cd^ohx&UiB-N%#*7_-W+rqq9$KgVGvx?>QxUd=1GhX!tLY zLRgpshr=nwlF-pTWeen;jwT%y{>whl17&Ik>oE3w^B=x~ZG$8Kbrh8FUrzavDG~C@ z;aQ*t@DY-(Tu05o7eoL$(bU`D{mmgE0l^qW`Io$ul+SVxKiKet4vP#6@fI3yZ`~&k z0X$%3m1csKc;8eZARh;Pj(|H-Vf!8{S^>f{YUC2$L#JkVeUv{}I zBez3oDf9jw5Ksea0z|stGBCMovc^6ej#EQGHVu?=yIudvH+_65>B9Zyl zjjlSkcz}leQ#*KsfP5$2cnDv1RUxnoa%sIkUgoYoc51`ly*hQjDCwsrwOD0r;ZHr8 zSNXXYlOvM*jc5#|ty?q|jo$gkZGSKmq_vQgRNpbHTQpYljq*_PDISRu=L_rbEr_INr-E z>(C7k+FIKP(xwr}NbXNva_MGFtr$defEq-w(KC%N)jhZ?PrIgI&HWOZ+pn>*VgE5! 
zq(JTB?9A84XLok!O~vMT(fBW#Hb_PW1_eF(sP+_OTi`C;1VLd04m%vhuviCE#fCF1 z0MP@kRYx6$dJpzIpg_a*&wheR1itx3V}~JQ!k+3tD6k4v>%3g;*H`kVahBQ4qaJ0C zF-9=+UX3k#p7k||RANk0E^C3@DC(oyOodb-GY_{9t{ zonsXjk7y3&i%nQ(FjYH}<(Hely#7bufl!no7~ug@A4SmXZj`>!Qi&pA1L6==BunaiqnxB@h$(>Ib^zO@;*(MnC|YmIkAudVF?q zQCZK`vYY}2p}`>>u$^stZu(yk&;9DCrcm%@%WYtE(?|`olXb9xsXAs&gMEK=2u5@= z===c_4;)Sbs?5*ogj~XWPEOe~p^ z+dI7SWqiIXKSnP4%zLWe9r1x7AtAxR#`ax17&ET0`V&3s8f<@G--md3m2MOuCDJ+{ ze@2}CfHpXBzj#?0=#Q9sb=wWKD`gg)IcRrwrsK>0>iA7g;(pBWCL`k57i zl7IuEnGf`&WCQkc_qV-tj@2r;oa$e@w84u9X`A5L#R+{GroJMml#*kT8qx6 zmMjM{#!L-t8nkHNEYN|>7A57o23ZA#5TJPWTORC}2norgBpaZXkpLNo{(ftqx22<_ z1L)E;2)OW46`g)+if#pQ{wAp5H}8YV8r0@yhfGzK?0vO~DAnFB zk&(@8y&6G>4Lq=wn9T2PD;!>~CnurNbBH|Z1B${q$ayg-Y47#M1~s+pN`#r1D z0V8XKwF%E7$BAssc^b&tz@cThr*nFExWQsuuY#Q15+;=|1X-$>n>cuu(mCJH?|Jp` z0HY12fy`hn_GID~)LNB6@vOlU)<{5QXP{mFgp<32kG4@*}x-kF7QAhrR`s*}W@I-SDh3fHbvjWa=4(U0)qsjjs-&V)g}-%3BJ zVP6f8K5t)Lh2C0oCb_%md@Fd_HXrEiu?g^=AxJ*O-3M93UD@tE$h~`XfY=5ykGnpy zLx!jYh|)^t_J?ayzztguwj1Ly7k$*x*=IL7E+s~|lLaV-<6~ay8w!mAAUPoa2giXQ zfrg|a+e7cRs`xkaHB|eVaN{jE+7Ipo|3!KAa33}mT)Lb~`9;)Rxhzf@SEq{_rlJ-b z-7r*7*5d|QzXh(?3PuVCog}{F+p{yk`_mwxw>N(*f_vl*`-o%`GM2_{{Kz=Ag{B+A zJ-z0m_}R$hNB5KqG@%IWH4`bkTTYq>U$xb(r}cjfD9o3*t>4VmsAiCrSh|Yv`HC#_6sNi(1xhnYa<2? 
z6eD9pBut;+%v3@GPM}#UDCFkpX}L)I&|IM2z1WKuQeb*eDxi(Y9fq&C#wsZsu`aFcxySjKRovrMB-O2#b(1|>QzNfLIpaZ4K~=NR?gr=zBZ zO_U^V!Ab+th5vO{jUA0Sgw;hOV`I$<_W@h(a{rbcEjv3MlNFM?`8$h4TGWm>QiPTDuY z;Se3MD|L1#{oB_s#LffC4%x+tNHWIbrryA>>iQ;>`a5j{yYR%de0D6=hwc&i9 zfle*j7s4!}$X9WQ+KqMpFVa@23t0ddkZ6nBI*FUmu^*;yWXr@g^pLtA*}fq5K<4Bp zvRfb5h2Z8CJV4|`4-(?MfzeX&*E*lAcL)`cT}@MZ`SZ7II7D%B>Ne((xbP#{iY2Eh zBn%X3?Ck)jGWdSCxM0|G0|t{xYW7ES#Y0+b6oWjq0!`#^%q$wpsQF|-5^e{;AvmJz z1o5_C&!2c3H5MO>+oqe6`|B^_t=}ayOZB`-L-7~`u?mOY%A$Ug2a;A6381{B4}hi7 z*cC| zfJO`Evr$pK4<>l8*T22)tk(?`cX7t#R#$KTpi*~O6&Lj5H~3CAr5vk~&m<=`A;tBb zh9S7RWXsufm$Yj-tP^-*APUwr5b`zZuJg+uey^LozHk-n9a~Pe1z8cNkAntdi`U6H zll+IP3IRYPSg_&3C5y7Zr>NnZR63o%l*XlmDXN!0dILfub3XVaupiEMey53+oCNv--Se!pp6$~#6aa8#goK3o{pv}(Q`LnK+X8X>q%it%Hm?NC|=OPL4+P9QXV1Y6MP_B1>q-jr+ z0;I$4ST1eAJ`=M63)(-=x^N81Ua9?xuD+AN=e0>rYLxic(X%z~($&9v2o+Nz(3Qut zsjTn~d-8EYa_jYRXD32`eOQcY{Ia}q(0 z@ju)0XLVB2ZmHF$5W9#ukbF?sdBScYNJtDp_X;Q;g2i4{H2tbsu(@xFP66$0L=kZx ztpc5jH&OE05uE*bQw^T(l9DJjCt)0jTG2lfDeuZ716D5nC(s`Rv@nO}+KtT-qMFIS zA{cn%-oHy*2?~=MU_c4T-G{%*$j+`J@{m~Jfg(uz_LYbKM9)XS$zQX`rT6T$|LSO0 zL>3yP`$<1@mXebnV)#%>IEB2DK^EQo#M=LYy6qGKvao6}`e0PjdgIEsYMrM2r(G$k zdhjl6JyItZ2tS#9sjZH3{t0Bax(`%;O9=*tz>D*r=m5}CQ&cB(iS_ttfm~Zz=HWSg z)CMn6Lh`Tge0Q&nMr0f##ppH27Ah1I!xQpYgRd!^Q6oBu{6$wy){opkkW$tit*uMk zY4&>FzCaBaj7e|T+MoQ!|9|~929tXQ`i^EpTLp%??0A9H%*=`GOGX~-vYe(e^WDbd zDiTvbic4ZJX*2!9fKsks{n1$R#wPKs3$;vk-V&yD_P$Z;Csz>>u~b~w9GzOOrhZrh z4(%Zbm4JUXzQU-zf8Td0nI+Xs33_^pqR^l8cz850kbi$auZWGcx-{aS4?DaK9HZuu zxP-sD+I2!`sHhbgOGgB`?w81*fLL)WRmD%!6Ay6Gm-fjnPP`#Ddb6OY2)?4m{}C8e-cO8fpx9{f)v3jrKY?<+n1lar(T40$o104)r?g zYs`Y)k9V8HagW9qUZS6pMx@%>opS& zov$sW#$XuGhP33VWezLPJ_XExL_4UbK zcy%hmxk#1BAL*>3*)@U<{Hj*!qK1c@Tpjt7w?X+AkD|Y)K(KfJfFp#tuIr3*WGqfb zI{3BlvJ=Q0Lf!$QtmKmDgT9}VeEr79IeM3(8d4KO1S^v|xaW7|!pi0Zkn-<*)J;R?DM* z)t_z{oWQ*E$wnN`_Y)uiaRQbG&*J<1(X@R}!35e+;2pD5=pb|i-JrIZf8ux-ZqY4( zTHaGgLOvXnVQB;D*=?k2{ZA9O`}*zv3id%t=mK6MKpE--r5&I}n)Nhq;ZJF2`}xHw z2&^2G2A#37BN&)Nd;{+O?KJOsTMAp;1tb}i_tl6%@_S*o_cBQ7BMuNboUqGnen!gy 
zHR4@=0>p3>s-hyjT)PabH`ae1th*YBre6!WTO&Lib!7HXm<{2O!~O1Gqr@77Hj!dh z^7<_Sr|bKBSQcrlE+!+7Q^R$9REL&`25%=HGKJkbX8oAuaoFZV#3=T@E#vk(4&xV1 zAuLOL1;T}V2HGOC+^>a&ho>9Ne_mYA-&Dhn6XqKOR}**>4sHq<#=fw-!wy~_Z+UuM zKu@GpkJ|_d_91q*J6-FsbjlCgXUQC2zXpnVReC1U(hF@t3NlOd%(E(r#RqGH+z;Jc zYkG~j3j-!~jW4MU);Gnp=?r z@67|FB(A4GGM}eVL1_g3XN3yhqv$H}@4# z+)Z_IU>i8avu0L;WO#VtksO$nI1d&wett^xLPk}P(H$ zcWNko;H9m5<{v&~^2I=(@ZKW|u>7JV*L&xJ17;Y|b*JQL+Bs~MHMKKU`T=OYi|tY} zGn18*xcHMDFyN%fxw!U%ALeEgLJ`yL_*l>j!wL=4Tfm@=WABiugMj#=rLhs~jFgiS znoipRGbR0?n$dWPQGQ$ABxaK2Ar3pLVDP3I?h!=8MN?C-6*(8-jPyN4Kng|rEa9OJ zqgL64i#@GZAe+L|Sty3U=SCPq2wmJm`-{RXk4p_Dm)(MJvCxM0tB2jbnp-c-k1ed< zU4=5xY(+aK__C){pRunW9Wt`xXK69^NAd4Tc_MFpd2pWi=*vApIV?)lCFdoOtG2o* z=ifh1{Qmt@czBf!gn=O=O&{kNOs}A2E*hCP<#j*yROKy3ch{hdMJ=1yHiz(iOKlF+ z<>aViB8c;CxZ3$iu6P;T6*@bJv*cVJEJClWSD<9RncMm~`6yS_#_$%`X%phR**zCG zfB&Kd$vZzj7~py^aZTOwXMZJxMahFRK>O1zrq|t*a;}yZ`Tz$^RsP}5`R;CKe zJ_f+pw)}^BeGP>rS!BpTnI3f)|28)*cf^7c3b-Xr7o_5Pu7-yYQ$ss@_NVk*U!IS* zxvwsG=W2lnTX;xvjqMs)!oI7p( z*eekp)@X>rF-mgUeaZSX`UGMzHmbq?Jr-$dZ{Ge%po^$CM>&zsDMxeQ}>=&&TT!&mEKK)ubQcvTSFhYqVnHRYfoyl5wAgoy)U4rD+!E z@Z|br-C0mZhARX0al+3X%#^&qh=T9UVqgkgWd{wNZ3ispA1?>kfZ|uV1&k$=%C^%f zP;V5m=dMud0ETBZDld%sJ2l^-6I2Sv2Kr{`vn4-}po$Gpyt5BM6u}Y^VS;{eADbfY z{ZaWc0qII!a)MC|LHA(g?l6O7d{rJDv~^G??nHayV?;6HYr!EeiQ>4@p`uMTo~b=Z zQv*)doy7Hs3kAqlZ|N!Ngh5GAS9qMvN8Rf##hv9ao?}9wQcL2z z&C!wZJsGE!@XW}_1($`(eS?`!m?_~t#O*y8^UEsvEW8^wM^@{}=w3jb43r3hxpr@- z#Uc?usC-N!|NVi_8MuoWe-`-C%VfaaQEKV4MOM`M)OS9frcd``bQES)qAe-stMu7y znc4sl>qEr>Gd)V4on-bg+Fo)@IIAa{Ho?M20yDU^;LS&A;=$jeO=GZ}L;%$p1hT$NE~xnzfTd=qc--S#}w!iPo1w#{6bXZ~YUEP}$uu;9a`^ zejvjC?3zJUk);{$ZOf3xDLuE(Mmt?mnBayr3#;ry;(UL@?5~X-XjZgBx%z=#pLs`bJ4Rx|V&CpVUd+=&$Y~No)6??P zMIF-ftn+2eGb~C`CyoW@Oa0RPqMn|b&@7A>O zSw*@|$v`X@IDB4)&W54Rz)*TH9uxx#tHOMY)N|m+_YbNTQJ=fuf5;=>g-I0U=bOwQ zU`WU)Dc$XUp>lvK>6fRye`UCpb|7*x{UUT$>EStjoOF10M_T06U0{)q1X-v#N_^0bCDKGC+k8{#I&Wens8rwzrd(SALVpxdS zxOWa9L*PLVzOb^hjRU3^IRKM1iDYu6dW6M|aYkALVvXS~irW{Y_v&UuDn0gFh8vTb 
znx4L&vAgvMhhErALIIc#A5yO5yNc%vskbufJQU2#dVyQD$j)S(s45~L#*FXPQ6GOV z@ec2ifTw`==qv}lAN|u!&B9NgM<^1pnW^es%{8>sr)Hf-^@#4)NCCjzY;?{1o-KlLtUR1peQdsbrkkx0aj_ zfj=oQ2>m2pfIZLr`~KcJBc;;ofIUG!&bdc*3m_zf-w5fW7Bv+M_y$A~_#6EDuN>uk z1kfuT^Eb=@O9^PCfG;H#JXUZ}md=5Tt$=u|w}0r5qkd5F+y^>v$p8l&!e0I;06|Kp z%ZE%Y^9@gXQ2jdP9z9)FN`$|cmLYFGGBM$FUhclwmi@uA#s}@szVyIkUwsh_J6vDi zY=s;gSP~MVX4-RdldI<#%~-fmi_N8qaOg+TO3UeoXied)=?% zOzWc(`OgKLQ|evcdY@ep8>_j?`gD^up78}=jHnz}`8Zy;3UCc(W~hCMwR27<8VvS7 zew^fZ(*^q$UpAgv;9%IEVZOO0;72>hz@Djf?{Eb$rjAKNVuY|<0^xRNWBmFxH;F#L(kqC!EyZ;^Tt zB|vNPtk}9B3E04YK&Te=1_KfBDZmBeG|k(HKYPZgRW8Y=>F9WL0fXwQs&ZIK6JBBP zcNb`?Aj+|Oam1$|!a>`7xepz~;jJf&dLhI-5t}dWA@+2(3q{J5Vx#4}ztgrW+W-~g3OmvKmMK>;z zbIf*j_8*>L&Z|((8lq~wuqd*XoL%N)_rBhko|bj-YcbrS!;J{KFhohb?KGfwt+GzE z5BW-&KHGHPH>%sga`!J0E zIXwplj*J?Yy@fzRc0KKH9K({0xv0|3MJzJlFupl%0?{ono~Ji^>7AGc12r@!+3S2A zC{}@SNg2%IOwh%>!?@n=)AgRqmuMEH7w3YLB?ieU%f7STWxV&32sBGPB7#V74YxeM z-V4AbP2pH_Ruy{=MX1-Tszu(QI5sYE4_xp&N`pWI;<=$=ksKjLEpeMp-dLmpIJDZn z{{Ae`W{WVhEyCAm{0r3>0o4Hb{S~N5F~JRg5j-9g0a8BA^Bvt6up{ZG^z;^7+OJ=~ zmbzE2_##J@Ld3{uIaehcw6B2;h7b)$Q>olt4Q%tmlBv$G+IeL#ntjL#t_Jv}`)SL$2~K(^|3E8T82yX9fM2vR&e zHWro^Asl@ok0pa)VU|KQ7CUyU2=qzyg3@6(SGW)11V>UVp$O3+;<~&yiZj$m#+Bw; z;J)H>|MWX5Bmqcw`#dTP8S-jh<~tY=+ms!D<6I2O>-+o0(9~RFIO4${iT4zbQF$&p zL4BG8qi8%nA;E&^OR)_`)laLFTovlg+TLWLg(OPUaEM~#>ABVyq@(~4&8{PE^gZKjI>3=h>2%i4(?Hk^^iYD}q z_&B#&KXirE#Jz49OHFfhB@M3c-$uH01UJ$)wF)L(XAKg*I9ToH zj_(DB7&fc=QW|V%#LG>7-1Jg5H zdT@RF0j-EZ4@10ikabQA9hsm#>*Ej4tLT-BNtP=E(h zfxE}ru}?{Lh67*3DMV4vJ1a8owZE8z#THf!?zU9BIl3wI$GEy?4&WKw%6IAH?}-0W zgGffiiqB8Dvid_*)Jni%Z6eWmCM=@tyGev)MOJhqf4k5d{bMj}ST4SFLY{%Q`X+e| zxJdXrA(Dz9@IzhK)7Ertqnla1_NTzYu9Mmzmv!H#kmL_FDOC9QtIwYOoS9s=1<{?V z#XbOxRVlm3zUZ}{mZodC)2=sHLMca45C3r-hA(x$@O?)5m2-)>Yq2Oxjf2+%xG}_B z7J8d=#mm;R<3+{PyAU4Q&nJ+Dq01vk^?8V1ww)JpjPygyP+^F+mKHk;3y0g`-HYRg ziv;%?@{!{$ikz{uY$`+B*E-cz)yB$R?jE9`GhvkP7ELuN-f4owmKZjz52Cuk?cxpM z7T_zZoEn9PwY61VUT&Y>(VS^T*&u3I{=^tEXH$;*@I~=UPEP1s!v)=2Tl@!~>5Ief 
zu0H>)MUVdXi+y*DNXcZdC^A7b{QL*?sKsOHNxiTc?q)F)y%HX z6Tz&kM2Pdg`&>ZR8L3%{5*hd5i-fktb#(MZszh6=M>x^|rE%UlhCD!26Ofp_?efem zBjRiJ^M7Ej?ega%nh6ZQzwzxLw{clbbu|yjW|ox=FWUhUpB^H*P_1-v16zX@s(R;Z z-5|J$G>@Ht&BcZqzIvN>P=*4Ww*+V8ni9@8|G#fY4ky*+uRtUM#lKleRdva!RxbVE zD-gfKW7I;lB)3=;?am6D8xF%yNFNq~BFySX1L{d>DckWKhUCoDE^Pk|y)6X|4FXgQ zB2*r755}0o3b0J@nZa)qL*h_w{cY)KV*$JPaBnQjR|mRGGajLSUommzPZ}6dtlY8F z%M_7LeuN0X(sx}Db|V3(_dlUdpJ42KXgXc&?7g%0G`(b0R$ch|ob`KS9BX1l1lI0++q^tG5aH#3zkj%` z#$^@d<%cIH$QR-s!eINnQ_nQNieT@F-vW5@uVmt&DFQYD?r#XXaSD#Qaya`ZF}EP; zR`(;rw_p!)7M(#n)!R$U#vUUHy-tV)hxk_(=|K<|z|Wg-h>LjF`%2#J+--rn{XYXl%|;&OAP@cC0IfQdziC1?OkkM8~(Octnhcc0+mZt=Sv$~}4$G!CgY zk}-03R~HvIbvW!nCr+aJoB=%IU(h=vcs8|382o_00=2m>07tULr|at*H?`;Iw_8-I zuD_%|C0!$6cy(Ua%2rkH8N%n>9}6DgptnE{t_r*tAZ1dV7>ZS8shuRww0wziso2sc z$jZVJ8C`j+|GpOoNG1>jx|>n_?MFevQ}V-ycTD47)OB)gYu%}+L{*d@l$FIa`7Lnw zHNs6B#2)W@``=g!RJiPw85x}|DB+n3`};%v(62R>1`OT#>zkX^7Z+EH5sO#Hq1!)kKZbV=R5di}UmF|!bkWz*mB&0*Sq(nlbQ9@$q?naOyq>-U>fNu}R z?fpE@`^EbBhqYY8c+Nii?0sFo+A~cBm=&pP3K)Lga67w_YhR7n>WrN6g(Z(aEjI=X zfJV*B-SQ=Rv~ZmoFMd%4#c)D`Bi?RFVDrdz0M)tRR95zE-5}e;wJ%x@P=z%ITg=qb zweC+iJO#zK;r*eReL~Q={eI3^^UzO%m;Jw~(e2fiw0CvoW{$ootMi6s zV5&uPSZ^v4p})3O!E@bI^wZAjbV*_1yZ7I@?;;)d3U%T#a@s*c2E!;xTdJ<37-B=T z0)v=nDXrSqY!LLAFJHU}O0mMLK$pY6%St?#($N9R{iLd@M}FyZt7CgAbG5?YENKIt zPNTAd*I$}*++-S7$Yn~D??pzYcoGF7R6wcs8%M#DkvU}}RTW$y6EZ=F5drWlJPa=%ma49+_L0|{_4I|Z!BqAX>F`I1Nj8uu%C zV1n3u7*hLFO*Ub3omKJ#YBDwV@pFW`oj++krVl08>opJ<^^F=E>j3I1qPFH7=o`De zH4@nI3dXwk`cjsvn|=}+ANzfw5q(HZJ3i9sqlCJe0+(ON_nQu+lX$i1mH_NW;_@=a z%5ZdQdEG96@gnIVyLduGsFRXwqJ9{|tjg6ID105H-G6#2vhW)6P znTYj_PAQv+wjfDK$z6$vyY~}X1dO^~R!Y6f7vRaHU|^_65hEmGvY}2~1Likt%0-mK z#8duN2Fk%H8tZPg?yptC0m|SvjGMrzGy`d)%p2D1TEAfAK4{Wn%M8=yDr2jV}By@H6 zGdDM@Z8Y5^0G`aP)$`rrs(TGUB&Yr(A5fuy>(2HC7}fe3hbN*)OMXp3S2rkCK7{to z%^N$CyN+AuP@xu8p*NYS{#iJ##v<^B4VaXSjmLi$1pNc${t;{olUvJ%Y>ehGY**@I52qmXTaTKex{fXJ3)>(&dNDI`- zqoF#wr}Lc_R{j@B?!)9Q{#ilp>A}-a3{uZY^W4V!HSE+Q>T+oqdArL)OA-?L^K;P& 
zT`HG7Ig=j|ISUHL*2=KVd$Bfv74>dwo*yxN&5-tT^k`}7qbsjn8k#*}ATyB%o=Ik6f1{Keq!Opie z!q1Ir&wHLAwojq)qrY7`Bzh#V#11b^$5htub^4z=ftzh*-qb8(r zi16kWdb+&~%o!mDC0?P*ernc70%_=EAB21^IS>QRo_ucvnnY|DTW%l0LQ`SnWtb4~A)`h02W2wzlm-;q7W zAi9(Tu%%xB4!VE<6}I31tSd;RL1l@Cm7aLJ4Bk2S*G7v*fqt!eok$;)u}Wdj>o_{X zYVBa!@6af0*FnrPaO(Uoa0=%4vrzf#3sJ;meBcUyeXr6OwxH8|DLORju*@RcGIFJK{J^ zUsIUN)nIlL$J^mCpopUG(8CAL5Gd_+n1WFR8etpUfPkHIex4#e{=uSd(IF@;eq;OQ zpFG#8q8H4rRef5i-(eOf?+mQ^u?|NGZO0^fzjQ!tN>B8<*KMRd^Xg}zM{tt!5OH^pffMt<57Omm} zw+TZvl_ zWu>WE;}q2d;1Y4v4XgE#OaNN3R!>iV+}Lo?xG?IlF{AFKqkJ*bv!M3-6Qrt?yD4a@ z5Lqf55TFD}(j_7)7+A~&>&@>1@R0#b{;{`bFen={9;9T2-^x%yAQkw0{Ja3F(t8tZ zU*;_%x$It<)Ka&-YdkzeCC}q0tOCZZceuo_=O>-XR6YK!!QL%@u5Nqddk_!6T5kwK>E!ta>u=O5Z^q!p_mA&WpUv{IetncXShzJ&;T=a< z%KX%Ew-}_*$`luIBsKIEKZ~Y^t8yPsf)?%fP)7vASlPTiS9{+~0?-s2*_&oePycQ3 z@srTh{Uke?pELFwODvki?V5k;R?gWSdE3tWhs{4E*|DD~4H|2zLKqI@Hq&6%r>{To zGmzVawEt@PdzF0id4pca#0Jjz9wc=u_RM$LT)oz1OkFhp$6vG4(A`*?Cf8DJr8NoA)nwb*b{LzPnUiIP z4UaExNdgt;KxJc=JSboQ_nSJon%+S;M0(0*`ram~0t zUw%R_9azf>eQ>Z&z0@tEH(a3~3Z-3jMv1eW_B)%_JCN%SmZs~kh@@Xd^Mbz$f|{DZ zAp#-Zf0ZVKP?)vpxrP^$gUW%1r2HmK`m z>L8fRyviMGr;||Ypc!xXD?X&4W@#R#(E2g-alqB3#%DncK$tAHO^?+;@hk{Cx9kQ1+k4zp*f!m|U{n zn$O+e7mXv}u(z|*douVVDTyI=h44j(`yo9ny0a+tK$9aCjH|*I{A?^Vud)dtH#g%W zr71~*(4!bcfU*O9GyDNRS9KHCxj7Gs;}JmWLXi7e@)50@=7E6a!Tu76E@>V-u*Amp zRY;$`f(B^z@lRtocMmV@3O_ZU-~?TZ# z2ZRJ4pvzLZS{SRi_-p;^)MSGf=Z!F55>cg^{F~Ifeu7oYBgXgwNs)b&;yyfv@DLd- zQ|mDwMlwa_(_n)dK?^_}m-HRfWo9h}KMRwqe@ zXkCR0Bdxz4-*at};_lH|KQ_=$Pz2f<&rEC#S#^^-^}EN5mba%lS=JJlMNK6G~Yz(|1 z+%E{jp9{i$rdjL9zUhXX<2c72(YFmI#1y3Vm zZEWyTMK&SD>Gbfe&U~mf!7O9-p6mUKifB$wdJmG>N6N7O0L7=^~L7Jm`4uiH;hRIk2Q?Sv0hqx_#_odNLX>ay`pn@l0aHC zt!zSP$e!GW=)I3NlgO1xNV!23i&SuO#WR(vGs_$Ug-| z?|D!!Srp1YBt6eZeXO|1ee-lX{Hhab;h`61`KIFRf_^O`*rzfJAw1NW_n;*2WPOFm z8jy=0Js^mqiiIm~xq4+Y;rkkD{!a!5mNdTZwxvLmy{Q+i9oLn^i z;YGGX`D-_nDc>GqIG8nPzLW7{8A4fI)4za(y0s_e-kkL2ypxB)xY4`;^@+xK&A7C! 
zaNJ$S%)fePOl@~CUFu%Z`rd0eG`nW|;;r`&;ohz0=rQ<-!QpB$3<0VZ6HvKlTrgi` z2Xf!i(aEj5$H-{9nn=pjQ`r1jrr*Qk*5oGQ#-k*l$RL;$ZCtp`BDe!V%$K;WidD|v zz=@juy5VqQTNoa0yQenJ5=R~>ZIRr5LBY4ZEo+0?^2^Yj1+!^g(u*N$^9LCO1cHxk zvw))Plltuol2n4KNBW2SoOC^j>arTdR_m{g1tgJ?<9iH0#CU}zwUWsectLu*KUa+3 zZ|d1|!eN1a__b6acVN~|VH7|m&|N^3@0A%ae|5G>o;R*CNgfk7r>?88{mMmy<3z^f>g^#ZsgrYzs*M0x zTe97g)Iw46z9lX4pE)L;WRZRqHdTMQuj1)?Y&-{-ysbXZwyvU+1r_h7b%gI8q@3a; z0ui26gd(>0Mqe$u%b=7d#f?w0+i()!+1UYk6c;__Y()T$iwOa2;>>JIu+NoW z|HD2Bh!w-*E3S%(y%~8xp=DfL#dzG@MLaWrY=rNB&G4L-k+sD7_`{zC#I2&XW=Z_# zLYER)jUwTfMJxj7qw?S^ai-TqZf^5s_VK0QEOf*6arkHTjsP5heH~AMh&)N#5zM|4 z+B#IXJiR`cu9bEo2SGSybw%ExpA}1{CiW|=NxBmyHbgbIUuBV=*JgOBmHrQYntLRJ zCP}@RT2~d`Y;SMR&zq)5#m&vlfmk%Jq%6gx1b>8RiT&3*p*523Sx#BpQcn$=CHgBu zZ$iaRKWgn(_RW}|>UP=Hb$=*x&uZLwJQgO)`~D0>h>V`L5|m<;yl+u*Q5H_8Jbuc|82NVrqV- z1-z_EChkvNY9B|P;|hc4sr)DSd#U*D%;Woa{l`nWl>&^(u3J|zCNbg8UEVuhj2v7o z*U1M2D@9XSq0GSSUwfdL09svSMiI*H8Zf;zb(mC8JKt}*uxU~}=@v@#B zo7u38Zp}k=*O&}EkTdxJi6|G>Rd$PHiwhDOvd6@ZBL@<;M#sl~LtkFY-RXeI>y>y< zlp{RDq~1z6fc3>o*5@Gw?BKBRn0Ry7kahKRVj!p>+iSKo`uic5CK>!OSUj{5N8j6T zdQd4DBu;gVu&zNB(zi+{rN#~p4uV!c8+x%nGXk~2EBzQ&J~&2_Mpj9ORcK&9JQia~ zF+<7NqYMZ31?YNgc+0bgFGcQ0iBKAX$xiBG#swF2?g|}Q(Je83=-^N?X60D<&BDS0 zv`WAs`$>Zf3?ewACD`A8-AQ~s4;%v}LT9eG>Ijqx^zZ%1nVvp=Op(Bpl2LauIvUPq zm)NDIsfkAmSY7#0x;ks7n>Wkez4Mad6u%FQ(0A{)78$p!!B0=@WW&Id79Da~8*3Wh z>QED}=6|?8M6KR4(!fGdyTBW02*HA7jSN6v?k9?FsBoQMJy@Q3D$T=&KTtb8gTwyqavl|Ts7?hArK`f_k003vXa@!89 zbU*bH|4XZzGEjN(M?QsQUnWAOew7K<)kEH6Vzh#sdcx&7Lv2Z)LJUP7@E(~V7((Zc zR-l)`I`(}3A;#MrgAkhLknSGx3EY$Fc~MoI#z3=NS{zm}u;sTEm8=i%&q1AR@m9hi z zk>1`n;o*H>zkUIdg2KW=5-AiP56|=;X~6Z%Cu zaZP%kO?!4JM{Q_A+MQc8MFhp{b4#9CiXWr;bw8*PJnVi1a6NQe&CZp}#_mi|)=+;N z^w!gNb83;_=c2{vxXpTM!X!*aeI;^9JAs6G_cQ1_xx};5reE4EN;r-H)oIQCOb-ljk;wClNT-V-v3-v>?C!1!o?N1chxwag3Iq-Jdj$V*!XD4>uzS!>ga1$ zrR)i;(i@`^WQ@OwY)h^VvMxexHxrqJEw1pZ0H{^=^?|0p5`1=tIA8N|qZVug2aO+jYjZuH%FgH<1pB8@k@}=6xODDxbaT&R}!FfS+ z3=BXHa1QQDo{@%FT3MOJ3TZrqgoK2EcN&~+6(+uk>y}$kpkMYpNu<+HUJmXpm-&8o 
z?1}bF>fW+oTLIyYzWb?_oq1zk6=Ll1WL3#jZKAw>y)wPz%zW@ivmI0(5KTVM(Fkd~ zUH{zEnvFE_Ao%j_DSxF#$~tTY{?XI0MIin;fC219YQr_;7 zU!JHWBCK1Fz1xS_)k9QYvc`hIQQ#Xt=&70>tE1Zz#<6w5aWa3a>NxE z7q70aeyfBwlr|Yz*x0n#RO{K;+ZT`jn70lcMe;plQTeGSXuNv;dVM^gJj0@I@xTXG zmb6WI-dY~4B|Ada>Z0wGCsGjy?`>LY=otxwLgutZflL*r=TeE0Gu#`frA1i?rO!9^ zQ~X9l`1w1&ttA5#ijM=QvW4xr2Je@WffDB}S}>P1Dk+!}! zd2|d%Jx~N3Xjs->EZi-txHwsCDDWLUJe|1nF4seUZ@9wKT#v1WFTzx-&IWqh+vAxt zv;~+m^)IRs{B|dvAIRA}W|N!22JOBvG3JmdUoR&kUsk)XfI8OX^r(ov^z_Rv1JCI& zLb^;q!s=ObHMBur98XOsYNvK$ES864*65~JFH2Eu-Oa#_>x6rSv^cUi9Nn19?Lb_w z2yM{WpXb|+AmZS?>Nuhw&c54;>s@bCX}vt(r3nDXR6MucF(aTTPP}>JG(H%uzUjSrP$~8H z<_hBr14mLA%@ZvxEj2YtxiuzYKjWbs)y$Ozxd5POZZiVY%Hrl4wD}=dETT@r@<-Ub zwKk8UH;fhdBhowzR1oVHl<&L7hoTTO=Db};+^dcEG`#p2E#TgccpWO+Dam}{2o*9w zeymQr4!Qx|kSHG?riB5CcIA}TUMHUWJeA()`6NRi99p}rWcw7O0Lmv|kH`J&1Do6y zBvWPl)nx69=<+Z5j=sIDZ>?-SV z|64#~qvJw7`{L^Ttr6rCb#r%3vNW+F*v&;d(Xa`<(nJ5y}j~`?wGWKrk#ZvQ_St1J^3>u>b^EB%gYKZ zoN@C-1ySWWFZOoH-a(25x1<%sTj(Wg%@3=D1^zFY!WACPEY3+FA4 zSIb;sRRCIVU^G(FEPd^R2yAPtxJ&{1kY@nr%?5ba(3?KAfV9i+-E8YMAb9#_d`9Mh zitl3g;dEx-Py$+~=KtZ{ZrZW z_(`1;I0Nde?qh<=347ygTS7Xa@>Mkg-*qG6`2@wt9k}l%nsq2u-1t6k?5BvZGusb* zi{o~f^yK4QMFcBmL9Z9akot+I+N286uk9xdcsBlx%n^fSv}h%xYvHmg1{Wvh7>2#-)Y{7R$2T$ zZ&y#%^eA<=8s zdNyXst-hs1b29pd{p4W?UPo|%Iuj@W*?D3)B_&}KCMXnF02c*Umr#!V)k&AN4X4{O zGS|rYfav|Q083~l*qe18e-#p1zhT9DH&MNcmzGs~8pXa1@t*v+GN6yNKbhO{-60TM z;|~c;J8YLK)Jfz}>j@5RoQpqT|0(pp%s$Nl<2Kztbi3 zwe!|&BT!aZ1%@WGo7XCg%qVK|U<-Ky`vj5xbj!pPVp|L?d`QZ#R>LbIwuamJ_O{ItK3{}gF40UT~yrG@HreyLiBkC4$g zOJ0lWgvn#6o|Y$$3Ag^{Z&CiQ-+#;Y`8Ih+U)OXn=3uyC3Yv7^Khxu@)#^ zeu6A=3IXQ;I{jN9f<`ZV4L#}r5>omjTf+5&O?%hPr@Z$RPN(Z*k(nEQ7 z;oGwYDXGkhwtK^jrxX(homdEx-3n447ceFlfP%UI|51}=w0@wufiH-{ybH|6NFAY)oW{iGeiF?97PTp6~q{K7`gW$Sh-qIn)=O`3nLjfRPuTOc1 z2rC^?Z?Bd2LMu>H;Vilgqf&G8iLlEO12r?u7qy+#Y-y;D{Pd2pUoJyyj)xsM$7}Wl zts2UPR@B7F83{%%jGrbPeOKb#Ymfya)A;~SQZ`*Wh_cFnvv%eup@^JmAV7J4;n{P^eC3vN$%^{U8O=Y|0{n3w-n-Y?t%pq;h!DXLMCnmHN6!Ke z*Fg~11=<&{BxQEt=Wbe8)c=K>yyj_~HV2`XNh6TE(N6%G@JB=ahtU4gbmcmM+%ANE 
zZBjl6kzl{|5NW;7|CJ~uHG#cdfjwWU6WOB|vELphpWLoN`i6m5IR8*@(A<-U~n!6=GU3m4eZ?a?->3$FNrkFPt)8YkUb-EJ+7(3vHdd8^Mq0{qQ!fQfqwkN8>-Cz@83XFTM+& z2lP&g4q}y^^V$FQSHJ-KJvh_U z$G_ffe}EyQUUT3I?v08;JE~&p2txZcaD@axE0J#TpmZtWoL5+nPv>ql3@a5%2Bm;DefX?(sM+kbd`actvhN`*Z-1AozF&9tQ)| zHYX>7BG#ikuFERr@n$@`&Kv}&V?WZf5j+7w8hQ8s&9wIfdR-r%U7}~Gp<$HryY@_O9s7s*rQWqHb=|5g6{Hfpb zgxNs%oNKPnFi7*LNffpB#PN46Z1eb~KON$L*yv8v-ImD$mR<-YAFzB=3l^{MXnf$!>D5S4 zq*6XbeB>|RR=)Qfb(dg+6k8e?8La0LZpiiu&B~ki?g57i>n-U>dWsUduP(dmni?7^ zk2WdaLn~c1csd`cK1I~HldSrg7#cDIkp%ezpIV1`Amd{{lTx8IUpSmJ&tnS0-fBc zgBPcPa-eBlzD{P#0u#GkNX$7f56WVD6ITweXdtMYo1(LmJjy!(Z(ufK+F~n+6fB}l z%~#S?>IEA)jU0yGlLc`wz%$>I`6M`rCm$Xjf)j*7qfHoe`^pxC+#Hd-8TA|?fTZ(J-(3AK4i_2>syQqJ)bVpqymKr+DYPl zDR4~brvq>3O+kcL&bQIg(iclxYL+<-DqhhvX!A5_WS#JPuxNtA;4SW67rx(n>1JZA zCYJ+fg4fr9N?a2#m3T)|C|f^!%VW}iR)DMN2)uR!rc$}yxacTxb^TMkRXn1ki1-Sh z!yt4oU8qOsf-jsFkS8xVTwJ%oTjOY9prMQ#NFvNkkb#$i{SvQ)IZn^^ViXgFzL92? zk0517J{fOFk5>?Ieki(q{8#4S0hAA#c_A|hgH)-1){R5)>be7uF@A#m_AeE&MWHk3U6TKt9vVmx62FWl3Jjch1&IRa_tp<@4G$T_F` zbTP7}S(5BbkjcooAZX-o!Gxmc3vpbHzly*64Xu3py>$K>IK@l8YWVFf2bzWFt9v3@ zJ!AsqnS)oMd~vb{l)d_eY?nWgkbkYux=P-O@)APfw{W?Pf@*5}VRczAd~GSszKHvDebrToP|T0) zAJ@O|c>g8U0-YUDaTGg-z7Fe;f@Sa*BkGtE*l!{}J)I*TyxAFm8zT`@NCLLTHw*|e zVhc&fq+@+XhPt_!g9`OU=xBdOc?urW?k3mU*e4z+rGk_EOM#cT+PH{Zd(=<*U*}>D z>>v;BJ6Viu7k;p{Y1^pSSgMN`M}O>o`671L zh9*F$qPyPzf%kDy5sTiCX^E;@C}zRX@Gyg@J!YBb#Kc5kApS`%uwPA$gR^vALZPQS z1s<;@_7EO$jjJ22E6~O@fM%}8jr7}JU?9T+#&#J8IOwfco7&Fr(^E)@ z@0hwghI~=(s(xZ$;3R<_?Zy}hhP@%h%Se_{c8%{Eh08>4t~T49UxZ_8uiT>&;F%Gu zxaop@u>*%O?Jl{A3CFWsCpux)m9_W*H|(9b{FKqT5vjWLCYyT-9xS;0XK4A>;D^Ux zLJkcLtqkXV%**>cW>p@dh$A^XGz3!UftLuZ(>!`rB3xXn4J1;*K|$liz$)MUq4sL5WRyJZL)U z#(|y#Kd6nCzn=Kl2^0;mEwuuA}tRP-_&7yxmwv8SL)?}WA??(XhYE#V+c zU|gT=8F(;pSW3-59K&wNJ^-D2U*4^c)qkv8rVbb7rUJ8`$mqF+5dR;v%XSvxCn^+( z9gKhEx(c_MO@QD9E$&iYy!FHBrwC|Mkh*DwQ$}{YN|Y0Xw#<74>gc?sy7bc3EDOLY z7~kvx&Y*wov8|3;9l~i=SK6PH8S>Za`cv(Vj!n2{wMqfNDp9&?bv2cTjZ-jrf^{41 z*rHJa_gKb>N8I#ocSs|NpYT0>`gCk;tP+a!wGL{kALiS2scvg)dtR1UCTaw45DK|t 
zA*mYr2{GN%xL}6`s7E)&%FES)zZ!7^+T_LEADqn_m)tZ-90^nJy{3H2{oRnp`fuTo z27xa*?mNr0PkkEME~wUGoFxw;86ZRXZ_J<$7m1b*A(%Vr7l6R=-8=l4kD2pSZxm1j zuXJD3J%Y^4f(qX1Ym}@#S`7cEQvr{}Ti`OA2v4Z}I`eIx)i(${F|SKW81rpETp8a& zjsm_rgHB?_b}xo*f7^V-dF(TpG$xghKzUM3OiXn2>u?BeOmrHkC9yC# z=RT>=u7#DA&X*S-oScmZZd3rI4|}{kb+go(<$~;#=4ep<*YWXdXpHmw+*}>rTM5^g zu7%sqKAcDURh0kMC;c-keo3?68sIgC$iP?^fwBTfYm7L>a1@R=q^7;V+xgmuA&$1% z?f~N|6!-XlAd+0BCGjb`O$7yn2(T{2%gGWRJYJV{kDyFnEQ9gIDv~!TkAiQ`w@= zG4Ay!#a1awtfNfwNbfXgkR6au_8WvHFkJxR>)#$Tfc}641hh3cuw#zKa+!ieO6tLt z4$KFFAsozjmVz7aetts!#xMB~BU&m5f*uBG$ULRt2AI-|(RL%E#FRo(LF-frDH$|E zM)&^}U;v^I_!g;xnQ*I(Y=6nZ6ZHaf0hW^OD|UvDPPkprm!@6iypA@U{Yw>|kRl7< zBfAP6;B+m>$!TwE!`rRU0jE~7{)(P@cz7xD&Buy^a(j9VfU0HAL-*@_Sd*RDSj}D+ zL%T*n*H83U*U!!#075fCIE8eb;>k|2{j_}&JKN~B$*_XYs%tgWe;HIBwIF*9Qo+Ui zj>phLrVflf6_#&m?DPuL)C?Q5`p{nywFOyM(0;V3_Wb$0F0|`1UDYE^O%m4O$r|}N zhy~qGF(6KkwuZ*tNB!cXia*s5S7P76%uY~4Gsjtq$HJ1ZP>!-*0O|jc&%*P@ z&VS|>K(1VQVk>gSecVG&j;8Aq-e2Sc{8Z2E|L4QvZkKVOd(}ujN8g_^aN@xaYinC> zJ9QFXI=L=J(JB1KU(1T1u5YCPIxV6QtlX~>z7 zVgWPjT=GQCp83duyHuG3IpfE9yIFlV>wTNk=?`4EyF^U>Odv+f)6uOf;fi^}28+-l zh-I3kNKI`E4bUXEmT=v_!-DZ&^e#~mpm(RJBT%7-=m!jp)oMf&zF#p_v0cAjK*JTw zwx8Zg97*QT{hPJUZNCW;vG|2nHnz36lz?sit@r_LQT85>xQo1tM(sIAV*=8U-e+yE=sKobCNT%jwKV=i<&gXh>hEpoWBV0Y8 z<5%CfC)Z>CH6pJ&n%pN7@Cyc&I@;P7!aRTl+kR8<_*iB?s`ciWm*2gb@TM_3HcLKg zSNtXuNeJHR$0OuvU6HYej}B#r4tS~P{&aEi%71rpUGftK8NNvi-Smwng+%PYjF=?u zOcMcUWTUCDa{=n;#rRj6#8n{J1KLYxwx}ocU|zO$`bBJ+N)EgY zF=Uqcr1!a>Ct$xJ^uekSx;N_iiq?0Wke%Qv=?$+`3zs+5)2n(m7{t z`p6v{n8wI_L>%^OX6~Sc?#a{sup#k`Ko6~C0526&@wkir!YWrk$a_Q>sVo@b-G1CU z2el3RRuhx=y*;b$I_+14rzi6XHOat~F>@le*DQ}$j5`&lduhoUXo_{I*64E1Mc6CB`t0Wo-UGkl~_=lJ1IkY;i-_!*d z8f?8aoQLG1qj2S~;8Oo)8GqXNrpQzbtobB__Bhog-zHTPrkn#h4?b%6I*ud5 z`>a4vjsDf8s9pJAT}rU?UI)?une=Y?3E0Pe?-C$nqK<~}PPTXU_3zOBvbB>52bIFd zq&;5bhPgsO8$0ee_zr(MAch;Ts6T?)hRH|`J4jkz51%H*KjYqhGY1hP$ZV<0{_C7QSVy7DjNV7| z!9w7k4hU8nM9lCT9>Jn)Z*D=dF9Qs|3xLmeHmd;MFP)GR6IpQlT|Qfuj~ENzd3X9w 
zu!Qt6vwu)HZj51S-r{JD-l`FJ>0!1Hb^y*yoOOO7#vmEb6?U>KZ)nm%FlYS#=X%_ie<&=q60Hssw$XcGmW2~L z8VQRgv137R1waXa4}9OhY>@h}3Ywe%r~OFPPhE*6GqVc@JlU;3W5Q&P?JK%(l}%(J zrq$a_M9zG-7IS}RK>o5oJ{*xdWphH8KdC@M>8{G<2}InI3rG73uL9H91h084c}RV? z`Y45Rkx4YjI!9b)`INg{?gxi2uJdAtM`!=TVe)@hCm($moXuIq7 zqHM1)sl3u|c?rgr(>+mH1)&8v3A+Dm?x?mE|0A^WemFA#@e@r>_HCjLht?jA62Yyo;-RALqt6}RbO?F;ui z0ZNU1{8sbfOe5vIlCe&%Vp55CbxqNrn@XhO7lGfq&CE>Bn-AWK#E{pqvw~vSSZ{6( z--Q!0kTj;`q?&BBLqI2w3`vgPQNGaOC4I}7{i;{52s+CQfu6JIR>k$N%?^@IbW`&3 z-E1JFVX#vUXv7zAyO3!=I@36_qrX3OF(CcOO|`=58+A%8^SG-?w@}$W^MTKX7m-3+L{0&inQvSrq1dgT$i=cO| zHgy|Cx^pBb;-ZN2ZvHn!tqdR)ND>bLi`Rmm zzajb*&#>fOX|9ey3^H!&i=b^i;8Q|wq}_89-M_-^sOEH02Z=2?L*H;I2{?CBV!ipL zy~=^Q`iAg`6|j!JpG)v@XTjzEV+4_Sb{3}gD{dBqsnzQPRMf4eRHra5s7Kv>tNV79 z0-@W=faOoUx8be|*Q-We`h^g@L*lyl_16j}$EFH`!*AJBVln()RW`P?;;LN;Ok!}B z9~}ggE!I}wRi2iGLGC)#XCNJI*I&EDE82*l2-r!)yL|-G!p@@Me!3mNVEx_q7!j8ZB;dB#!!aO z8?UY`$5`}L&#lBR*F+krQ?H~R-W1LDg5FM#pmkWw!n4Ym(@6cT;ZvyEM{jP`yhoml%hESP2x0kEj7`J22LB-4~?#^$T z7#&{%!f?=DTXZ5NAlq>X53vL<^b3}_+5a?Du*>-}MZNk>SfgD5e-YMa>^w_PkHE$K z9Ci?exGrAdj`VgglL{BNQTuKUs^SQY6yQCO3a84a@L994w*IlSR5WJgb<7j{F}1eE zb%TS0<6U}&*yT>-TTRi26$8&L#eYbCJKAm*`i!JXJNHMvfQO`x_ulV^y}thO4A9Jq!lNJtSxxXr7zso$u%U7WsXqP%U;)OLPqVzngoxm z1MumC(AdR)n1@|EfYbsX-;qJ$aQ>a_)uP)1gq{{>YKQ(a z8aaP{ezsln>aYHEcl%%cDYH|vu9ZL2r(2emoTwyzV{Gi!voNwX*&xadykB`_=_l7D zB^d@3lEhS3bucr|Gla1U30*91P>;j97TRRwyfdVJ5M@L|OB>6h`|Pt8J+a@th+0&4 z>fZg>Enc8jyP~5h(hA}55}BFyF1-&A2&U_Ml25& zBy>i~_8XwqiHMl5bG&_fqSV@%yaMDO|Lp!!VoI5=J^ju)GXj_FMM!;rQ6p1y8_o0l zUQPh>TmbF;??e_59%Rn+;Qk~?AE{c&r}p+%2lR)Vd#78L>WFeQ=Z#vAR13UHBP*cYrJN1~|fDFs?ODTkuCoT;x!OS!u0@`)0 zjQeFb2&ZNbln2QUV80>T_{Uj|m|gmvNG(k84v6y>-$0=$3%Vq0WZyv%(%8~d7cxCv z0FnXcACCbIngkyo0IN@7IBd(wKz@#2t4{onVg|&a{K}V+^w=nxdqq@J1isO{$k(sH z=t0A-0d4bwQ63px_X~HKrdOt10``A3{1vULKXqqVo)X|S1#)LVR)~qwlzq&}5fBuN zjEn>@Q<5U6q?y1rXurH%Y38$;UN!rOdQrog=G_~M0(4ITU$$SS0BQG&_4|FyNs{il-rqR<661x1NC zaQ9ob6%u*>ib(_lwO5G(dfv@zY{W7;HG@w>I(7LE*eG{8eAr0WV{-37MU&A_5U(5`@G(;LGhbH 
zZ39~TMw1!;G>jqn`@=sd{FcFZdZjb2dECG~4O#>~*Pqhx2C=d;$2pd3X%&KOyIG}0 zBku#TJl=~ibeztAf&;9E0W7{6cv44WQoGx30XhEjX6!&N_d6%3Z4uhr`aeY5*~b48 zZSNBP6m2^Ydb+;^IcXw}t6^_CjN%_`m4M1jq!%=9Cak1S4qvvFy9DbP7M*V} z!-TaoT7cQ)mh;vQbjb<|9v-J4#HuAJ{x#^$`AkefPQB~bALTe*1c5;RowEI7v~1Sc zGqcEzlNIEHPgNM9k%qW(YD>AeT1x&Xy}q&5PG{stMi?rka3k;ZXx<;L>91?!k zgqKxjfynIO_PRCxK-%G;wZG5@$oTU;oW4H6G{!&h14?-jzzy(H_~iP|5~Pk;<-^U2b=86k6RU>P9GbqoG)dGrN4mLJg38Axyz#RFiB z0?j5S*k{`(_EYL4gKqH^BdCdRs-9;@WEeK20Pj7>-{w28N3*VyTLiqTwjj}lJf@sL zA_gH{fT_DD&v1`{q2>_i5<%c~)#WQ_n&z+O`z+xx=6GQMxRQtcZCLq;O8Kq2`LMn4 z>)o0+A&b$e->Y5AuCpQ>Hoq^$>gW4j5BZ6tpx-7oE-o&1_Kk`v{`Mpg9!W_>brbxY z@qBT2KxCd?vj}}a|Alp+rkq}f&eTD!R~(0qep`o`)J!*O$oGE;!Uzk!n? z<|%KzY!tv18ui+QzL57uv{I?ivTwsZ!{fIm)n!f*t<6z$r*x|h8;QNz*B^I=SHb*3 z-0}Mj_C}x4FU<@CRy3FBpFHmc)rY-Ef7i`S%Vzlsswwh+3X-b@2(n7rN#w1>zJB}q zrB!kbS_PY;PTh*l>DASzBBW^JMBBdVOz-|aT`bej0~#W+bjYZMg{{B7I6dCm+uj}= z92B~7W4ivPxixs`R#v$dCNHMpBkF&$D4{?vukGTT+pl1p{dRr*6kc8FX}v7AeGXRe zL*yio0-}gpM8R~=nIw)u6>oK8_x_EzSEb`x%EHB|j;c%A_rGM)XsOb?5wj5Ly7f%; z3ze4jw=35*;zoyaFzzS&4yv1%MX24m@WnCUp!nYUo*7+>KJw~<9n_4qS@>l4Gz(Wi__Utn8 z;ltub+*y82F*8@=kZ%O7D$IDtQVEXNJNiEuj-4E{oqt_^qUd?wxTeHqXZC%Wf|(Y7 z8BWfYU)jK#-Jqe0lzl`y^qt&F_RvaOitu>6j{diIuidiZm=JByYxXP|oQSj)x-GbM zYR62kozTP~#HG6E^A5ilqf_^zx9>OUPt560xaPoL&G@+CoWln@RqJCV30X$%{6`=3 zoaZjBF)zNbFfq*xh zl^d+(g38-t+IW(}L6Lx3vq&8%-SX_Bw^YD()gMvObs*9WWp#9a`p8jcoWnZOH%6<{ zLe_Q9rQ*?M--n==Agk2zR%B-Do;TAP(NM03XW zKYIVT+D1QCQQdVZBKbwY(Gj#K9=V?4RiBpr(#0j&ZN2W4@3I002C~n_?106Esrn7_ zBE@&eLH?cR?}hRU6flyKAmd_U!UL*%_hzM8Nm{_awU>UjW+J-8ElX3nw&s}e2(~;Z z9}@C1DM@>w6MC0+c>IX))D`bJY?!kEW!n^Zy+5vR!Mf{ULjEqncgQ(qe>pk1ci4TM z0f{NV$RLn4a1SFjs;9X#e=2gHFn9k^2XxUpe_n^kEEVY<>mFfM@a>ZkBAi~xaLMAd zskg{50V$FtF^Tw?` zCih^mKJY-2q}(y56@9@xNr>8_|Lh{+5pS~mz0Kj`fSxtYlMG?^+gVB_-aJ>c~56?i3b$Z#Q#)XAk-OS-;#3 zgU%mzSw(e3Wx;gE;h)TKNF>8-SsmUW&)p&ma)+s%+~r0D^jPdn@@of3G-sEwab#=| zG)oB{jB8^b^+$i<(kXl7>&9yNs2e)Ns>-RP0+j1r85)hL#o3+rmX z@Ey{Gd7xOs8BWXYp~X@?siFD7U!U(Jz9+f5_O71q1~I=^_bLl-@*YK&hccx*!mGhtQjXh=PCtLkmTk 
zA|SoDC`Ed)P$ME$Ktdp)MId|!{JwkddvCq<{(G#IKR7EnduH~`Z+?59J#(b?E}y61 z?iW>}NdgM~%%bQwpPw2S7UKK+8)j)aXDOaDPWA>2Wwprqe`MJf^%KzV38>N->C^-(vCIet+Cl!s`OZON1=j$WZN2!=F|FGqW4Z@gu%Y4815iHT>@mENJ* z*_r#L`b*q1g8wo8n?-r!65qf*ac2s1bGDvDpx4&hot^kiZTlNDE32#Wh~>Rkx~}8p z)`ydh5%T_SMYVn;5;3Malvn#t0w=|TObz$&vFPf2z3GUT`;HcNuPwvmY|{{W$oP<> z>S!!03eU35Tpr@GqQx0NF=^Mf@#K-H~WVHOF{F3AA`7<3+IuC3!;TuJLp-3DL&dG)JRoCT}xkLxcTHzJk~tq z&!cPOCiSw%_SjmI582t;-66+%l)(DPF#*(=N394 zv$Oe?4!3+ivA>S+o9p~(0Kw~J-`d^i7*@yebM6837hav!``?VgM&v9RZ4BGDDqD2& z7&|2I?mH_pqVP2}x28UQdRGgtp#^m(#C6rl`#MPzGqpvhxakoJZE$s2yo(o{5b@`K0acbDRpqa8-p(?O(7DqI`;Lhgw9^T z9Bc&LgvC%wDtLO#Of|0e(?hyjmE5tUck5)j;50*C5O(3wR>cH8L;;6kxhc&dMJz$# zESsIrre0AAJ~oWxlDs5xmU5`3ewRaDT1|p8yp3^`{q-NTIErLNk_qNlSd}kgi!+#I zlJCy?#Ac86Q81E&87=S;e<$A|+&=i>S>-k(o#*(4W`;)Or!@SC^aHxgTwLtnQl7Qo z20Et5lgmk&xP(>!vLxxW%8Jcb6Xp|vY6jzi4~g-J=o;k{WmozFo(1=V21E8}W=4Vx zB^z={Lpj&Wxq`MM@@zAV>x{-Ni9^eZ>oo_VF=z*=nP;NrG^xyD^+eEue0mgsK)~RD zxo=84d~a(hn6_j4>eafeow_B`w&zPm2u)3Lv4_;Z(bVR*cr&VXV`f&bI-@#A$8~nM zrt$tc67-)z4T!Q8Z0Ab^9YlLB?@|Dw=+ZeCS%qzDz;qzwd#)a1$fX#0py+D$mD!*S zNO%WJX@2qK7JpFmJVh+ZBZiX|F8H?|IIW3E&KMNKZ|sBPJ@>jtgdLnVLIGw}l?-i< zWm}T4KKA^a>4PSlfffL?4(C>hj%^KFy-bEQgscG|_ecRQ(34$M2yV1%C|APyUzU5k zHcMGCvz|?xtVm1@-&)<+u)jGeR7ayQv-=XE-tUf;{uVHUv1$!YcKaL)BpkYbz$?H? 
z*H>18k}P&|BF+y6IDy_DoqEUx!LoP^iDo@wk{WPq3+;CCOG*YjbqXtU!^$#ZX@CSc zyqXIj%W;aN@Zo7CLEZd2h6A$Kf;L1TsUjDwu=_WTo5&q(9KyX)1(kjgW zy7@RQi*$6P<{PpHhr0eG;6s0~%lqF_#XQjvow0EMW!KMy?+&CB;piIp%)!zj&{=@v zFNr897Sx&uoUcclRjT9A&Rt)vOkjvdvYLzX2VJO@*jxC7 zR*qT*i2`7*N;UrrYri};3%tQ{dcLS5FMI)J)GPI6Z6@n+$8~@Qh!My;V8L4h(UUD}ZjEk|WMQbT1`H@kOI=ie3DV{E*XGV`Gcut#$p2gto0u7It=zCuVUqoE51rE30I3YUX!niN+_LGT8u4A^9|aY`pKVtI0dn`afr&D zk5T-tgx#-(xci4~^Tx6Yt}S`>gU-{@;ij>}C^nWoDmh{QS@_iMl1Q;K5(H2m+FxKx zl$K`GR_jb>MN>A*2z3PB*9aQr)+JjW~lJpwBsO9u-y*9JP=_#O(q z+&~}@9CAKjvKTL+vStKGWyg!)SuCXnE&1$eiXL%bP~bxmU81I#!hG}h$_^Zgr4ZCu zD50b{{%g?w>!%IKlPeV5F+hnl_wkDQ|LaPD#0*tT$EkC}rwiZ9!4h*nywRIpDE_7CUeG6qk>oL` z@OfHf*%|xE2ZUtn#d;6sH8QEV6yT99ECjCJqrR=SZsBnaDA&0&J*3{dyqGe$1^310 z*8N_x!od4(41qV!@RE|QdzN->Z}YCzxNaJR-%OP&`ms%>Wn3Nz(g z)!scy8$p1-)BJkXVnpxRpk*mY|7{d6bfT8>3b4*oGVBcyX6}D$QD0;JRyFtHQ#B&z z?I8~8Xb!sW&Q3KuyL`hpIyl~>ikJDZI(*4koK+LTIWyHnK#FXn z-zQ%Bz`pR*E3Ny|=~>3zJV8uVO>K%a-Qm9fLx zwNMR0wX45fNQ;_~b%3;NW2BA-#OaiKKlh7HkIQ~in~sE8%jZAPN%%Q zYj*EVO7SVrimgRu5B(&rdTu1BKcc43 zzhAER_2Rx}n9Egkd{@E0ef{!pLG46Y)OZYZb>sEZz6QqL7Y!sX={{96w6H)2zR?#z z;HGMei-k~Hs5LMrp)C{odeFhu%}pm%N0)0%7O!^ak87nDb*X_;uKrU7x^ww~?Z^$a zcd%Av&;{+NGVbI(kp~Z$1l5xx1^XWbshH``*|}6%gKTyz`e2Kr;(imXv1i(;Qf&Lr zvwUhvyrDBXJ~OD`UNK^=e-6m8{lX( zPxl=jBtTeI3k`t>b9`3$UAC-Ou2@&x|3grH=)N@CT^jqTP@9oernCIx&u!rCJ>;1r z34u%0RZ5qrH+M~7FfaVFhs(3z#04MVh&5&gA2ck)q_rd;r7!5CW|`vMZyN;RF%fK6 zD+{8{h|Ul)W<^jkv=oQ(*rz9Zqdha21=&pb(%GU>D*Fg#7qG(|YcB#)djn1@+6z^N zvm%?;9#5MtF*^5+3bx!oQR)H1!%+T+iuw8Zr#fm?lWxW(wT+D=kCcnq+1XrxURCg? zFrn7@_YT!TKu?;=O0oA|{2X*>9X=p%ztII5t(qY<7Jx1Mf~Jg&i~<))mh%gv#xU4M z`2!t3A}Le|6^mWj3T6N!X=^b2oRiqq#P0ekf zf;)xYxs$QXLRALcQ!KXQe8;p`7&fMBaiAkxc;|KHvxeK^>zNlD(&4UK8m-?}dR=bw zQiCPP*Lv9${wPP)$~dFj$CSZoeIao5QhlRWGFApLC70|8urh`dWtFLpquI38s(jR? 
zYc4PjeI`mJ>b<^+2|rSUL6OM8gM5O;`QOaCnFW-VGM+o55g!n06AIX-02UV3sxMG8 z=|&fm-`JR*esB8m0h8R_gR@b2E?_JG_H2Vn4$EH~yrWti2+VuVbZ?w|e3CGL!s(Mw zV!x=8!Q<(c0>3oG`zO8M81ga1uF1G>N1o+T(dh)idp+i9?c$Xc?of0d9kEcEw@bU{ z-?&Bp^4fPDaYcf4q>1SVjoyfXwN<6aCh>W{{M#?Jia%)fzL2|r-Lb1yHU3>rr$9xg zYi2sllFX=#tC=rYJZrh8RLk{M-GZe5*pI1FBo4O6rR>SH23qgrSP)WE~&Xw}>q{P@pp zIJ(T2D-K?tMRIO?< z5pQ!g&}Z0?I#+$I>c`_&ZI47M7#>2_7$(xFkv!1&^_TpXT}7a~Uu`=m9d0E}9Ps)) ze_-P`p!zFcSqSl>(bNJw$;Mx3#D`Jup&o4)a(%}PCyPzS+ndUPO5$O|LB(fWx6X!f zrF+__-y6y3vlLadv1e^0P$#MhH4(ie{_XLPjy;|$(b3j6lFl!Ec(lZ%q@;wDjLJ*X zd1X=WfwXgRb)6Tv4(KP$Ow(U=kSkVMX4$Ap_rwSk1MSbN&U9+9lVlrpky3K=X`U|l z#H*bGwgRQ%M=BhGgKu6uCxFuO5h6e5sWQ5e+70#6&b@7mK)`bKWjPX3Qkd7pza7*n zwyO!Npmj>p2qdt7rKLceV(J50fhdRMLaP>TX#D)=dY4s~{>)SBn|H@q^D_(tSy+Af zxjwksal}7MDG@VetxFs5v)^PMNO=2EAoC)XdfWFGYWg=E?Tu8F0z`w*Rt{B32E`zW zpRA;`H_m+A_I+K?WH;G7bix)n0dH{C0ssI2 literal 0 HcmV?d00001 diff --git a/example/ck_tile/15_fused_moe/misc/moe-3.png b/example/ck_tile/15_fused_moe/misc/moe-3.png new file mode 100644 index 0000000000000000000000000000000000000000..77c6d9b6e43ea2c2ef9087eadff6028b6af3f113 GIT binary patch literal 18655 zcmeIZc{tSV|2I6PlFF7O$r7O`E?br?St84jJ%l9tnz3XVOZEztgt0YbWCjyr>{Akj zVHo?GWb7u)Scc*LP}lYQey{KSJMQB?j^{Y;=eeKjFEiuwd7tO`KHukid%e#1Tl!jz z$Ic!Dfk2GfH#LkvAi9Sj(81?N4+6i$|B*Qj0^LMtYp9w$99kSbo~iG*^Itc6&zJu#9Z0mwxK6E!HNBtAFdw$e#vcNK?yr++Z7eTS_$^pn<}!mE9fxWoK*rqLZ+imP~5snrM` zZl1Y8LjyLL=lMNRcX^BM?0)0ob>QuwWEePW(y6flhEw5eO{IwINr`usSWVWfT{+ux z(`;87X*$|2EIq2WC-`~8NEvNtdQ34hKo8uKW4RPanTH53=0edx}J?mEva6wVD8n( z@yFTn;HAioQ3$J}U=WC}^DXnMr?86K;P-AuUVZQQkG;VA9)~l4KtCklF5Jy;|M>Gk zX85Ip`@f%;zQMP*PQ@t1(LFv8T)x78w8ja6D?phxQizMcw)FzQqO}1+(st z_K%Q%bn=g#{D0%@WKTO@3Ehblbp8G@o11r`>->K?jem~uKgan0(qnw@=Jy|z2r0sg z*08>E_;@?KEudk?`W0-3z-M=_MWK(kJ~>;;PMt_CsuS207HLlL$U3Pq*9c2sYptbC zgb%3ghrGD+6H!D&W*xR2hyg;%UqJN z#RK_5o=T$HpYzjmeCICp%jn?=N<>fViuJb@0yMSNMvGalvXvf<7Rwa0!98hkOoBJaIu2HbNGHzpljrkbbnKMECmmODN*gyVaTzt@T1-9SN@2^@&{AgaDS*1?v!nQslN2d5>Z8CjX3jLXY4}KINWt{^l;9DnJV$y 
z;21U-pn~512mVE-vypA{)wIpO^lMFQ3i!tU>PC!E3PaVC{RD)CaI7{(Si{+~^=qv| zmx1Fm0TDLu!~0napVWb|!?h^|@9-umgTbLO$`kqy2gUx} zkJD0DGp^U=+N3PhjlgWZw5k4%2G)&)YhEKXGRQObfCY+3|&|`3- z|1DCMG!LfP!b%DXMgpLiR|%tI0(^5(oUlpR`^vi%W$SdnY8NB|=D%mf(kk%YJCj$%$DD1Ng-sO8KUsKqE#5-=(P8`xeSy^To7YGkyFp$3bpvKByzQH( z$G+SL7S>uKN`OOSd(8KYYD>XFgt$aI1aK(pr?6o}_I?+|SuI37v zVc!Ydu}MF2Zb8g!2)Dd0M71Dr8Sn1q!#0K!tVwf0sx#4fSELg>lYFrcy`3;?$WwcR zqY1m<4gX9)Nr?{`{6|%26OAjXTJu&{JJv z_Xsvq>ph(LG_v9D^f;Yg=)LcAw(Bqlx_+cGDl zFle0Irczx%PHd9fNf_woHImpaaVfsMM`KXZLI2b?W+~->QZUACX)W|X*Ns@mIKl|^ zdzeW4f$jEsSg%IP#tK)b^C)F7j+?e||7{*jH zm0`)v!%0pvf=tatDwH zzN(9Jca~r`hIxB5whO0TC7cNfsyKvMXNu!b7#n&oD~mPN*`C;DtKk4n-nwF*E8Z&b zQH&!W{gpt@1wjdrhu^l9rt_EM4T{E_uAAw~cvMQoSFZlGs*^ zs+@@n5t$Uk9?tgM`O=jsKsQ!v0G@aF&L)DCcRYZVz2dMpG1tkpdQ3vr^9DC8Ouxv) zZmL{;NKx_XmUl_e;5)~Puj*9kLbg8KyX%*4X4WasY@f#=m3nJp#@oHs0JoNISv@kO zobX6&wWF+JayqP}GffmK?kP`QLwyCNgO@tS$Q#l*f<-mLgF7yg~ z=S^79=zx{JmxTfE&WRJa79-N?@T2a@^m)s5H*``Y^;qJA8oyh~c#UOfwZj&IdrNZ7 zp|)ON%p9S|i3#{P`O0IvRNtqUvkfZ&5n^hn)XG2rg{aEikzhwhpz6ao@i+&VDPh+8 zCpS16PW2SO!dLlIR?nV-LIUi3onaXfCR+P}?(G@#HNp~w<~emqQhHpwld~pd`uR4S zE2V4E)q>f{kw?*okTy%7_+zPE;o(o`?DQ~G)O+o9G84orPfB|Cn&(8FlQNn8ey(yR zXU?r<@o`1_(+`)w3rk^^+(IeU{n!u`lTON0cBILf2OW8?)Gir;9<-|^+Chyla`!&5 z>LKSvc>MQU@tZ%MMz>w{^b6Js|AI2v=>Zgfr+cA)_oBFEYUtKL0P{3ABp4h1#clk} z*d$aT^@411dpzlb=w)@l2cc+C_>ap&!+xN{zPf8bl#HwUmJ4)k6^B>~E)j z-xB0Xa^&q2cz}G_gJENZ{e1L6D*l=zbER8jx`xxTaLK7RW(FsqLWZ`YWgcOPS;e_> z8ysUUZP?+nWXtD2F?V4r&mqR`qbjMm9XrZcGP5UR-LKFk??3t#m3thex7b=BJbmu# zaC+dkyCnhoStLs=bYZGF z6g(TZetPrF;$e{DM}_SEyng?)K`9BzU8*@e76TDqU&Ad1pz{rnp1(N8ML#AS_)q|w zj+7*&DZg}mOx>Ruqp5g!S?$?-;~z30n+6(RG=^?F+oS(d&k2D?2cetI%Jf&I~I z&%+9P=CLA-iUn^bB?+$_2*;#3l_W0g#tw^lOsy?N>Gla+v_B1x_g32J+g@H(F=zYa zH6vMNWrYc#2eEKY5{n;xUrdf%I58LWF*dH8JbeTDh4#eE``PJ=9q1RKddXjVpw)C8J{-qT(@Ot<4K~P<*Cree_ zUh)>j2)N#dT=4iW{L*%jJCQJX`-tId2SIV~ekfaqL<#XH{CjUa2Snkj9{}KL=TK;Y=@o_g72l%$Gul;b^4fMdcjf`;&NKq*gyX)Py 
zxVDZ(v8>INT2Im`{>Xi^EByB~*ch9`bi0nWs-z{!#BEL}OlhOIy_73*coiu8)!{wa?-oe~CZk+Fgs@4enZqZc{NNQ0DtcBb9`DS!Vf3vQ*>vqt+*% zQLlX6N9|^c*I#)%jb*IwcrcHr-8_g9>E;Kahk-29J(T zqbRnUILt7;X(!kWyR+DUeN`8#ur^<`*5ckO1?P;mt;h=+C0J3U;j@DKNAOtQ$<-;p zEec%c(^;5ngq^5=Vf@Qu4^n$deC2kV3a&BnepP`Ji|P<|lTjB?RL@@;dS#M)>e$%*$%i7jTwqkkk z94VpE!D)Yj%>Y8=Ld~x0Q|8jo8psB!B4+c?2;nmR*DQVrM(o=W|R&+LV2>{)yi;`0R0Y znj?}?ES(xp`G6c+T!w2B7?fy}1_KgwiHlv7PMF(zvx$ux)qU1F3m> z<{DNF_Wf-+eEcjdqFpe9c&5NcB42HU;NkkFsYR*ACpE}IqGSQ+kjz8oqt8n_EbAU{ zYfVc}9)r>ftW_v;L&i=KLGt8TmCb`^#hNxphItw$>jR}sm$u!0xQUn%#jWvMN&Q=o z1Yz;=ZfBx-VXawE&#;ATEevVw!BqGXGOA>;&uyg~&AzF0pLoA=VNB~sE?jm%!;x`ht=wrfM<%SyO?u9W&*!peFUpoYS@*3_k_C4a-&V8mMIRsir?^2(9 zoc=GnKdtW!%S!tT|q{3ck1(S z@c5jp)VolV+nEo(rZ+rFE+%SJhUR@Zc0vJZxB+(=eJ%g!OvjB(Rp#>9H=!*PV+q?G!_?>-b`^Fd;1j3FZ;%XU zv9#rAR#;yE#t(B#&dacII4O4yCH8flWn;--o$xZh!(udRMCt+%UV(u8D4Ge^C<6J> zADpx4ibmg6%roTZbI-gr_SFAA_rXgS)r?O=5xpFjFSpxn*U%m8Pk2b%NN~NM_R0Oc zza&*%3fVCCB7|9x8N3=?+?gUJhvoaNReoulMB4|*_g}eho^qSBmQEh#>LLzjP{@MW zY;Hb3_8kB6vvei}9q2|&tA&sc2U0Z?UfUVo_DH*OyL0-;v2(o_e`-ZcL)4=XC7FM8 zx@Dlw7n4c|cz1XdoKwlxj{zGS7ljBFP?Ee(QnBGBda>Qn+RG0RccY^b$oA6twXI{- zE8RshOqIj~biH_DRJ9H*&OFD>UQSlPBc&-KNkJ3NAx%9$0d2b8`=FH7z#a7-J8yG^`<9%q(txYQh?KeBSTI8(qo5Dg&X)L6h&X4n`n&Yn* zWyi@84FYoBH?VR?6&Kw+VV4%J6h*EaN^~XHu2aXel+5}v2)JT==<)Ut2hFrkQ>WHW zJ02J0%alg37p)@ z>|vj0e5Q%{s%n7D%1^88 zemp6&e77@^d2NOz$WD!L<^Kh>#f-{Z&|ZneYVmtA!$-jyr~iKo;77eZ3Z@M+V+N>; zq<-}Hdl$mU*wpP)0mMyD(!ucCcCyFs>;o2!V_sqDk`5zd<7aL-AG|07l%9f*)m9&G z$?Ka95x48n=-cqk(Fip6P<}bG(${k<(Q{;WF$X%vJ=dT21R?i+^`c%#!OmQN^pn)8 zdV$h>NK5jlcgX5POuz-=OE>8+TF(4FM{#MI&c}w7;j^dJ2<-a=3t($F2}i-I!~1NI znn!yijJ;ym{w(~G^Va?&<}7diR^)XR+sm#=+?rS~QLO**@ZG_O@f@InnOROMDRkmOv+dV!P-?(F;` z=R4w+{tJhy>^Q&IjQHDH&6d^6fy`fP#o%pOUA?x^uWpipy>24<+qK|)!oFPjLps#Tc%G+j==jK-rFm-=j>lIB7C)Abg$x*`?roUIfVikq0RMzf+YOY zv;U!gfF=VJ7GD7{P2SH_sd=`b%PM1yNVH0N;98QYAg+D+BuvdJ0k0v!0xRL$@7a_A z{28L0!!H;D=qcg^GtA$ZrT0~rP_zus!05!4i#=$qy&^4zvys&*q%ZP2+uK>eOgzjr 
zOD?9Z3H7Bmn^k3jo5IZ@Wx=!S^ptz%6*m`NyzgvKwY${CHa!EqAz6WDArXT`psXUh^CL~I)xA`%XzTax>ImG>E4=+x$ zh+YR*uhYc|^Afj)klc8Ac7nQ|qd6EnWYpL`Uw8iVbc;ATk8RExP#?9WkIwlat+hKD zPvCeU7+9$k(9;FhO{5QyHN(Qr^=zNS7oj#y3p-CDBDU2CR82!3SgNwb1MB7zx}3U{ z)KXPv6f5lFk-xl&nS}c4AbL%4wb#a0RdvnnW!mkXmmaZ+219B223cW4^$o5%#_5;X zzc1 zVW$u>uXNo^9cCJg^h@{!Wt$FAL)H0im^wEE27eM^*I&MNv#>L0dMc;NvvIx8573vY z?d{nB?B;k)PH4w0I~1$p$Ky-?Cdv{0-0Sji6$PNw#ZYRR{=P{ljeh67swR0cww=%p zB(&=;X|D}nZH!>!;0Rf|!(%OZef2Z#BFz59Q=)PaI|ZDa3ZiJ&=W^e22_c7L6`ew! z7t)#F7VvN9Uf}2*(YUqvV>kz{umu?(9ktKLP+ByJQlmNp_rK}#g4Yp|%D2bB^WHbi zA_Umd{lI+4qB<|CS$zmz;>F8&-SxQRxM16@pNsi!JTqMr@N90*u`k7ccgCtn{R(3E z6XR~z=J}5?Cif7eHpX*qqctTIW#83vZ4Dks>yfvQvvX3z?m2ZVdj*`6N%hfTzQyy!PAVDi>-Hb< z?l>!qn4X+-g7?|2Wy}H9|J1<^I0NPoZBe!}Gvuw4)_e#z>>~7mQ#NT$3>3!>#Bd-S zJ{u36ml7_Hl}ZVvQNbC_rA|OA45Y&dkG~oJ7_afZR2kB}2t5GeJG#$a+}X~SpIuu5 zfmjahQwYLBJ2MY~_>S%Ox;MC2+qLKU7C|5luKlO6Y#Z&`AW$yL-m~%JhoHm@AdumH zYrDydEG3+g-?x2#1B6*W3<|%skAZIs%}5C!0L7{8vnWGN2mae{`j)As^WDNiNB&i) zwj$&ti0}CR)UxFn{v=^WzNZ8I*e7V(XN@}P>;DPp^SWc!m0~hCrcz-Tm3*N!z0eV0 zt>JI4?~yc`6>4Lp#6y0hkgd0Ro61-txq6XHYfCMUao5%>!hADf&bI^tKbNmp0sa4P z%6sdTLa)Fkpb|!echCg^u{ufBVW+Tk|i&%&HD~C_#LFLnbh}TF-cqBUT^<8xv3(H4f7?!1zf3r0^-v@8v;_p zAnQ#QuKd+LpS^aJ-vgMpCy5nQm9z&#JhlEDO8hZ1sH;_nxe0E{vGg2_`&eME>%Eot zaL`~2GNhugG*dbWfR=9880+WwG3!S`xj+;H#>~@MY5zK6-(t1x;=49+M`rX-tT_4m z8vP0)VR(J)s((xaPNmc+xfYMDVTaafA^{WX-0 z##*sny5w_enjcC~y2P4Rcf{|Q)wi}Q!?KG5siorAj7Vn+vuqn4xGnw=k$M$oPt-dX z;*oR}EKrdH;^W?**b{zWVmIcyy?D2lChF`x!(h1;UPh#4a^H!RqtZG zsAc|2fj&?7^b1GEk+J7#?>1J&#cEw{NtkVK;Fh_?<;E9B>F*u=EtAr&ZBcyHw0Ju^H71UNuLA`;VB0aLNa@Mhe_CZh z>~BjYvF5V=CHvVu6-YN15n5lkt4pAeUv8C<`pQa&gQJ`^{t};i0T3Va`wV50qKJqLp!tGx&Jx+8kJ z4JGG0-#!H@+h0?gl>v^iGIVe#!g^@NX}76EO4w9r$B&XN&q+_w#R9yl!m}oV2tRhy zK zvB&y@Rk^P*qt3UKANgq+&X^{M#JO8`o-%jXek(B@a?=I?R4nNq*41bK!Dy#!!V77ZG-V5iTRjnU04WBEFk zfMO3K3GElFZha05-3A%tJP}}$qe?1`hH~&h`z13dcR|G-#4uVmBW#_c2^25|!#&?h zR?~o+4Ua+qzDh+NaIYcw&wC9e3!^20c)|UG!-h>-dM`Zo^KLaY8h1Q7fqR`&_&%*v 
zeiFa@_<=)?v)|C`_dfvs+&IQPHrqbty4qy`x3xH3tc%@=b|Fp;`nlcdZ@3Y8_Ul|u zj0`H}im_(rei0pntAq|tt86gOXR2C@KYXnf8}*odWUQi_gGXakr_A?==jCuOVj&tXa7D^NoxC+>)#7u$e zgeA`W0{e+vJ(c}7@z|;q0Pz8=LZDOZKzr_Y9|!Pds(%a09kJg;ci1k^)pbfSbC-4q zOXnSe&vFY%+y4OGS5+rirn_iQnaWw3&jM5f02!+&aPOI{N3mOFoyIQWm-o8&nGD=> z-sz4Dz~ZYDr+2rBFVK|>9VWYPNJHpQ890N*QQS|iDF_51sf z+>s>8eT91mfaXt$fOMWEkYv~Gw9Mbz8~Za8xPIv=Ig?@$NWJc8X|&nlCj#+vECSs4 z2frvslQ#JmTp1}0FVFpY`M^WAfBEqGR9Xc1ApK;pu#!__j2@KR{BuI53qH3Lx3%_)<5({CNo13huLTmB>20Wv z*rB^X^CAh2B&|(9N$Y09O=kytHbybb&dzFUYs-#!e@Zq`wkWogq2zH3|qNIuwlECNh-==J0bNSV|hb2;ITX+n+0YW^?)#P_N}$+v%p@+ z2|ri~Qi3ZOsW1Q6%byDI&#+yn|P%36TG`Qb7AmN;R*^lBv|Aj^1hgF>Hd`}^yLw@?@5C>C} zq#9c|#PwhO5N{bhIpGkCQN<1uG8A@Fr z4!fkp`XGwAEz}SHS~VzfPMcN4i!#3Uh>7|-JGBw!(H?B_BTdJ2;-DUOEJ95-E*amHWgEg-Wu`dP5#4_}QErr(Pshh(Nsql`tyC7plFbo{1po=$W2!N)h+O zk?PkUW$W;$gl=00j8Z85wx}HBlz79&J!qd*PUKi!V0p!R{wh^SVy$a064>f6lcQpm z?=N~sB2Sru{TGM)zC1fWSL{_Yi@54B^frK3&iihy{~`jMfiUYKRXE~cfvZGDuB&>2 zh#^NyOW0$3$`bY9t%`Tbs$n!rKPu4#8c3ZjK#iDey-H6n9G|bA&NfD2sdFgn8sEXH z2YEr_kU&s43qX#^=gZuq+oBp02}93;VCZQ(^jNh5Dzis_bDA{l0Sn1JIr?HySTXYt*_kpUS{}#?p0W+sw&=$iT} z(XFJ!yZ|wnm=cuNHiYpAP{xoCY?#L3-}97{QSAZUkRxJ)OXW}f-zZkPVjA=yH_KfO zDoQ5SgrA(~Z|>@8SuCPo3)ywM48`U8*+I*!Qf$9Xy?8$K+NL|mDhSU=4GnASLfhW^ zD3eOODu6u+*4M9OO75u%m#;*wH<=WY)&-F(>wr_ql7|pmQ$$Tq&mZRZ5m5Zbh?EIF zaSjIkb~nyp0RZq$qRck$Had&2g`3*J3q{J;x1T;$Z-IU+sgriI2)oa10r9Q1^b9Ne zaxouYHxa{CR$gA*GO>M7m-%ktNicgFMfuG5u6_tJkK#k%cnKX6Xi!ZFm~2ic59Nwr zcpI%8I=eN_NUG7G^GaZbZX3x1t`;z2q^^lzA+`4@2uaEkUw9|RlJVFb7Z(MP`T5=g zAbW8;z?l{~SB&ySS{FW0+Z-~!I=4L5`?y&D$+-W_#+w8SAbDl-Q-q{DUW!IEYz_t0 z3@T{l6m+z<9;-{yqI^Hhap(0XUG_A~_8{4SL$Ys4)uo<)cz2*}2T4h`b4KsQ9P7=k zc}Jz?xz3&rechcgP6r+PpGDjn`jnnLA z%2iZPiM7?#%g0yQy{TXF8{Qd>pXXU-oQsTBl|8eedhA;0=6rywykp@?z)01D(13&N z{@dN_*pt1}^As^g9!0=j8|Jj^TM~N(&RL8f4|h~5u@EY`GllS&L<(^PyKT}-G8fw} z>+XEbV>J;9XL-yG3mWB|l?Rf2a>Sl$v*u0K+I!P2XRY&Gn&ZjrBjriQl?_A{I`wltE(1YU9C+gTPl4Vliq(dvU zDx>SzPj88`H=<_+k`!O0q~sre*q$cQSH1=+;_ro-2!S*nyUEfWxZadfCsU{gisJ)f 
zQo+kN`=2iZpZiat_dm-4pXmoZz#?q>mz{qiep)5B9sxMJ_-h;+&XOi-O7qz2=0HntQ zoB#;aVugY3#Z~N$=?x(0e3dhOCTx2W6Yz&vASP}|+u#2IZe42ugM&ccK+gVu=u6Yg zpODp)GrbtE@j~N*ATd{3%7=#wlWVva$PXa%W7+lm;lk#77Vh1UIuNaG~kzhl7PSLvlS)O z2$a|1!mO~*n$D-Ip=xXs!0qY}$wzTO&H7|S85kPS`#0^9sQ(^>I{4p!P>nR-C=takA%Yv!-p0(@SGXDMbvw0;Ru1jLgk6GYk^*vfPP$!XOji-n+{uRDD$kqiV z6@f|o1FVB)WwHFlD}!SN0@Bu_fxo8M)e$_|%(#_|e5JB>V;5t9_PMyt-V*YlD!CNKU)Gb=^Pz8D4Wm!?Al8w<^Z)q&{oAv@1`^=TL43+o< z(}&q7))%%G8gKrLp=m3tNx36CI&*wJMbsJ7CEYo~(_?Zx02Z1sKGDvPp_{#P0tZCn zvP7D@eii`DQC-M=2mt-;v(KCeRZt`JX6+vWf<1Y*i-0<3q{RANquTX3hgnuF;A=!q z^afhbIc!%`%&Fn8I{4VFViQv`$YA4&=K(dl(>ZRPeFU_>MNjqERqp%c=*@NO;|hfs zE6^)s3D92Di8e)4-(P*)BSKK< z|F_*1m!OeTkDEu!J*@V28*SkAGmdx3aw%CQ48m<-hl>at2*jz7yK+squ%jsVlFXKazqfAB#4I!3!}(I_sZo%IU&fmGn2=W2b^@-bLeB@%Y`L z>zU3+!M3Vy%A*L)Zy-(LX$pY!jxEB5FDa{OSbvD@}!va#o7A?PA%@Vh3WH@^NlhM25)MLI};~d^f(XZ4uW$ z@8++5?p@EWj`!M#{T=MTBb2G{Bmo>{_?KToq(-s9j$``j=V&MZ)LL?0;x(GJXBsWk zk+Ql)K>7FL2}078;?|mcGABgI8U}Y+8?iJrD0Zfior-aN{9Ld;Q`b750tR2(OEHK| z@QG!=^hg-hVJ7ZGP0q6p3f_deU#`6-tqVB7(m$8z@xo`#*`yQWs64F*Z{w}T$e|bem zRZVIe-xHayHO&gP{iur5+^rjQs&?fk_uq65(z_H({DNLrUhVz5n{Ie`5N&wx-llOS zw}P^Z{VQd^9hY?~@)FnqVG>s>bw&G%y_va+OyJqRe{s$AZ{EFo2jr=d;%8Y;;C`}S zla5T}bm=JQY#@#k*MxEe7upXBGT=TKMs1s!AHTEX$vj_8X}oHB!Y;AtKCPI!G)>I0 zq>G%^t>WidPyfaBSDq~qs!jxM{2jBBZDP6kQH2{mJNH|Q5<>9yFV{jTE@Gpi?ytcX zz!r)8bmTKO4-Htnom75=5v-J(JRa|el?XS{7?W6+#Fnby=H{T-HoYM4^l(9^yl8y~>Y4JSUBdmCUl zWu?9SXqur}991fn%4o-u3P$L%gB;vNd5CyzJgy`5W$we=2}eHqe_f$AlFKo49RCB+ znuX|H8}D9OZEWb?%F1Aa32d+R7o}L++divBRq4Ku43LZ7z*F*nQ z)=?;3oKW+rucek_a+P{nHihG=0)oQD#0pd7{POAFl7s7V{FLyOqmuM8x<0K|ywMd* z*O)u`{w=+jh-<`~)O6OleoQLtn?K|(QE2%zR{lKo-;$09Zw)8s9p86lfsz2zhj$>z zD9|p_?xWP-2FsO_hVItVQ2pt%pW{AfD9;Tch@K(@)W1CjBrH*mrL3?Uo;mv=5cKnc zTKT|H96#pIe97o?l+eiDotwSn=>!r;|73=_t>UwA=pI$&Y8xOGqoZs;P literal 0 HcmV?d00001 diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index b6a44f76b7..29305405bc 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ 
-14,3 +14,5 @@ add_subdirectory(11_add_rmsnorm2d_rdquant) add_subdirectory(12_smoothquant) add_subdirectory(13_moe_sorting) add_subdirectory(14_moe_smoothquant) +add_subdirectory(15_fused_moe) + diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 3b198502d0..3cf0c2595d 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -52,6 +52,7 @@ #include "ck_tile/core/tensor/tile_elementwise.hpp" #include "ck_tile/core/tensor/tile_window.hpp" #include "ck_tile/core/tensor/tile_window_linear.hpp" +#include "ck_tile/core/tensor/tile_window_utils.hpp" #include "ck_tile/core/tensor/update_tile.hpp" #include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/functional.hpp" @@ -62,6 +63,7 @@ #include "ck_tile/core/utility/philox_rand.hpp" #include "ck_tile/core/utility/random.hpp" #include "ck_tile/core/utility/reduce_operator.hpp" +#include "ck_tile/core/utility/static_counter.hpp" #include "ck_tile/core/utility/to_sequence.hpp" #include "ck_tile/core/utility/transpose_vectors.hpp" #include "ck_tile/core/utility/type_traits.hpp" diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 3feede4d2e..bebf035e9c 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -621,6 +621,65 @@ CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0) asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); } +CK_TILE_DEVICE void lds_load_fence(index_t cnt = 0) +{ + asm volatile("s_waitcnt lgkmcnt(%0)" : : "n"(cnt) : "memory"); +} + +template +struct buffer_atomic_add_if; + +template +struct buffer_atomic_add_if +{ + template + CK_TILE_DEVICE void operator()(const T& value, + int32x4_t res /*buffer resource*/, + index_t v_offset, + index_t /*s_offset*/, + index_t i_offset /*max 0xFFF*/, + index_t flag = 1) + { + static_assert(sizeof(T) == 4); + auto save_exec = __builtin_amdgcn_read_exec(); + using 
mbuf_t = float; + asm volatile("v_cmpx_le_u32 exec, 1, %4\n" + "global_atomic_pk_add_bf16 %0, %1, %2 offset:%3\n" + "s_mov_b64 exec %5" + : + : "v"(v_offset), + "v"(bit_cast(value)), + "s"(res.xy), + "n"(i_offset), + "v"(flag), + "s"(save_exec) + : "memory"); + } +}; + +template +struct buffer_atomic_add; + +template +struct buffer_atomic_add +{ + template + CK_TILE_DEVICE void operator()(const T& value, + int32x4_t res /*buffer resource*/, + index_t v_offset, + index_t /*s_offset*/, + index_t i_offset /*max 0xFFF*/, + index_t /*flag = 1*/) + { + static_assert(sizeof(T) == 4); + using mbuf_t = float; + asm volatile("global_atomic_pk_add_bf16 %0, %1, %2 offset:%3" + : + : "v"(v_offset), "v"(bit_cast(value)), "s"(res.xy), "n"(i_offset) + : "memory"); + } +}; + namespace impl { // below type indicate the data type used for buffer load inline asm // clang-format off @@ -810,6 +869,11 @@ CK_TILE_DEVICE void buffer_store_fence(index_t cnt = 0) asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); } +CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt = 0) +{ + asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); +} + // buffer load i8 CK_TILE_DEVICE_EXTERN int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, @@ -2378,6 +2442,45 @@ CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer& src_thread_ #endif } +template +CK_TILE_DEVICE void amd_buffer_atomic_add_raw(const thread_buffer& src_thread_data, + T* p_dst_wave, + const index_t dst_thread_element_offset, + const index_t dst_linear_element_offset, + const bool dst_thread_element_valid, + const index_t dst_element_space_size, + bool_constant = {}) +{ + const int32x4_t dst_wave_buffer_resource = + make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); + + index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); + index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T); + + if constexpr(oob_conditional_check) + { + 
buffer_atomic_add_if{}(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + 0, + dst_linear_addr_offset, + dst_thread_element_valid); + } + else + { + buffer_atomic_add{}(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + 0, + dst_linear_addr_offset, + 1); + } +} + // buffer_atomic_max requires: // 1) p_dst_wave must point to global memory // 2) p_dst_wave must be a wavewise pointer. diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 65a3a4e2ff..afcf982a63 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -73,6 +73,24 @@ CK_TILE_DEVICE void block_sync_lds() #endif } +CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0) +{ +#ifdef __gfx12__ + asm volatile("s_wait_loadcnt %0 \n" + "s_barrier_signal -1 \n" + "s_barrier_wait -1" + : + : "n"(cnt) + : "memory"); +#else + asm volatile("s_waitcnt vmcnt(%0) \n" + "s_barrier" + : + : "n"(cnt) + : "memory"); +#endif +} + CK_TILE_DEVICE void block_sync_lds_direct_load() { asm volatile("\ diff --git a/include/ck_tile/core/arch/utility.hpp b/include/ck_tile/core/arch/utility.hpp index a88780459b..df0f54c5ed 100644 --- a/include/ck_tile/core/arch/utility.hpp +++ b/include/ck_tile/core/arch/utility.hpp @@ -102,4 +102,28 @@ CK_TILE_DEVICE T warp_shuffle(const T& v_local, uint32_t src_lane) #endif } +template +CK_TILE_DEVICE auto flag_to_exec(const T& v_flag) +{ + static_assert(sizeof(T) == 4); + // per-thread v_flag store into 2x sgpr + uint32x2_t exec_flag; + asm volatile("v_cmp_ge_u32 %[s_exec_flag], %[v_flag], 1" + : [s_exec_flag] "=s"(exec_flag) + : [v_flag] "v"(v_flag)); + return exec_flag; +} + +template +CK_TILE_DEVICE auto cmp_lt_to_exec(const X& x, const Y& y) +{ + static_assert(sizeof(X) == 4 && sizeof(Y) == 4); + // per-thread cmp store into 2x sgpr + uint32x2_t exec_flag; + asm volatile("v_cmp_lt_u32 %[s_exec_flag], %[v_x], %[v_y]" + : [s_exec_flag] "=s"(exec_flag) + : [v_x] "v"(x), [v_y] 
"v"(y)); + return exec_flag; +} + } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index 2cc788d422..7dffa0e555 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -437,34 +437,74 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x) + CK_TILE_DEVICE void update(index_t i, + index_t linear_offset, + bool is_valid_element, + const X& x, + bool_constant = {}) { if constexpr(Op == memory_operation_enum::set) { - this->template set(i, linear_offset, is_valid_element, x); + this->template set(i, linear_offset, is_valid_element, x); } else if constexpr(Op == memory_operation_enum::atomic_add) { - this->template atomic_add(i, linear_offset, is_valid_element, x); + this->template atomic_add( + i, linear_offset, is_valid_element, x); } else if constexpr(Op == memory_operation_enum::atomic_max) { - this->template atomic_max(i, linear_offset, is_valid_element, x); + this->template atomic_max( + i, linear_offset, is_valid_element, x); } // FIXME: remove memory_operation_enum::add else if constexpr(Op == memory_operation_enum::add) { - auto tmp = this->template get(i, linear_offset, is_valid_element); - this->template set(i, linear_offset, is_valid_element, x + tmp); + auto tmp = + this->template get(i, linear_offset, is_valid_element); + this->template set( + i, linear_offset, is_valid_element, x + tmp); // tmp += x; // this->template set(i, is_valid_element, tmp); } } + // i is offset of T, not X. 
i should be aligned to X + template >::scalar_type, + typename vector_traits>::scalar_type>::value, + bool>::type = false> + CK_TILE_DEVICE void update_raw(index_t i, + index_t linear_offset, + bool is_valid_element, + const X& x, + bool_constant = {}, + bool_constant = {}) + { + if constexpr(Op == memory_operation_enum::set) + { + this->template set_raw(i, linear_offset, is_valid_element, x); + } + else if constexpr(Op == memory_operation_enum::atomic_add) + { + this->template atomic_add_raw( + i, linear_offset, is_valid_element, x); + } + else if constexpr(Op == memory_operation_enum::atomic_max) + { + // this->template atomic_max_raw(i, linear_offset, is_valid_element, x); + } + } + // i is offset of T, not X. i should be aligned to X template >::scalar_type, typename vector_traits>::scalar_type>::value, @@ -585,6 +626,39 @@ struct buffer_view>::scalar_type, + typename vector_traits>::scalar_type>::value, + bool>::type = false> + CK_TILE_DEVICE void + atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x) + { + // using scalar_t = typename vector_traits>::scalar_type; + + // X contains multiple T + constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; + + constexpr index_t scalar_per_x_vector = vector_traits>::vector_size; + + static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, + "wrong! 
X should contain multiple T"); + + static_assert(get_address_space() == address_space_enum::global, "only support global mem"); + + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + + amd_buffer_atomic_add_raw, + t_per_x, + Coherence, + oob_conditional_check, + pre_nop>( + x, p_data_, i, linear_offset, is_valid_element, buffer_size_); + } + + template >::scalar_type, typename vector_traits>::scalar_type>::value, diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp index f150fc54ca..b280a1725d 100644 --- a/include/ck_tile/core/tensor/load_tile.hpp +++ b/include/ck_tile/core/tensor/load_tile.hpp @@ -22,28 +22,32 @@ template CK_TILE_DEVICE auto load_tile(const tile_window_with_static_distribution& tile_window, + number = {}, bool_constant = {}) { - return tile_window.load(number<-1>{}, bool_constant{}); + return tile_window.load(number{}, bool_constant{}); } template CK_TILE_DEVICE auto load_tile(const tile_window_linear& tile_window, + number = {}, bool_constant = {}) { - return tile_window.load(number<-1>{}, bool_constant{}); + return tile_window.load(number{}, bool_constant{}); } template CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile, const tile_window_with_static_distribution& tile_window, + number = {}, bool_constant = {}) { - return tile_window.load(dst_tile, bool_constant{}); + return tile_window.load(dst_tile, number{}, bool_constant{}); +} + +template +CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile, + const tile_window_linear& tile_window, + number = {}, + bool_constant = {}) +{ + return tile_window.load(dst_tile, number{}, bool_constant{}); } /** @@ -76,6 +100,7 @@ template CK_TILE_DEVICE auto load_tile_raw(T& tile, @@ -83,11 +108,12 @@ CK_TILE_DEVICE auto load_tile_raw(T& tile, WindowLengths_, TileDistribution_, NumCoord>& tile_window, + number = {}, bool_constant = {}, bool_constant = {}) { tile_window.load_raw( - tile, number<-1>{}, bool_constant{}, 
bool_constant{}); + tile, number{}, bool_constant{}, bool_constant{}); } template CK_TILE_DEVICE auto load_tile_raw(T& tile, @@ -102,11 +129,12 @@ CK_TILE_DEVICE auto load_tile_raw(T& tile, WindowLengths_, TileDistribution_, LinearBottomDims_>& tile_window, + number = {}, bool_constant = {}, bool_constant = {}) { tile_window.load_raw( - tile, number<-1>{}, bool_constant{}, bool_constant{}); + tile, number{}, bool_constant{}, bool_constant{}); } template CK_TILE_DEVICE auto @@ -122,11 +151,14 @@ async_load_tile_raw(LdsTileWindow_&& lds_tile, WindowLengths_, TileDistribution_, NumCoord>& tile_window, + number = {}, bool_constant = {}, bool_constant = {}) { - return tile_window.async_load_raw( - lds_tile, number<-1>{}, bool_constant{}, bool_constant{}); + return tile_window.async_load_raw(lds_tile, + number{}, + bool_constant{}, + bool_constant{}); } template CK_TILE_DEVICE auto async_load_tile_raw(LdsTileWindow_&& lds_tile, @@ -141,11 +174,14 @@ CK_TILE_DEVICE auto async_load_tile_raw(LdsTileWindow_&& lds_tile, WindowLengths_, TileDistribution_, LinearBottomDims_>& tile_window, + number = {}, bool_constant = {}, bool_constant = {}) { - return tile_window.async_load_raw( - lds_tile, number<-1>{}, bool_constant{}, bool_constant{}); + return tile_window.async_load_raw(lds_tile, + number{}, + bool_constant{}, + bool_constant{}); } CK_TILE_DEVICE auto async_load_fence(index_t cnt = 0) diff --git a/include/ck_tile/core/tensor/static_distributed_tensor.hpp b/include/ck_tile/core/tensor/static_distributed_tensor.hpp index 29c20bed00..568d618ec2 100644 --- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp +++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp @@ -201,4 +201,30 @@ CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks_from_x_unpacks(YLengths, number return unpacks; } +namespace detail { + +// check if 2 static_distributed_tensor has same data type and size of element +// but only difference in distribution +template +struct 
is_similiar_distributed_tensor +{ + static constexpr bool value = false; +}; + +template +struct is_similiar_distributed_tensor, + static_distributed_tensor> +{ + using Tx = static_distributed_tensor; + using Ty = static_distributed_tensor; + static constexpr bool value = std::is_same_v && + Tx::get_thread_buffer_size() == Ty::get_thread_buffer_size(); +}; + +template +inline constexpr bool is_similiar_distributed_tensor_v = + is_similiar_distributed_tensor::value; + +} // namespace detail + } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp index 698ce5378d..4c72ed0859 100644 --- a/include/ck_tile/core/tensor/tensor_view.hpp +++ b/include/ck_tile/core/tensor/tensor_view.hpp @@ -333,6 +333,48 @@ struct tensor_view coord.get_offset(), linear_offset, is_valid_element, x); } + // X is vector of DataType. + // "coord" is coordinate of DataType, not X. "coord" should be aligned to X + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr void + update_vectorized_elements_raw(const TensorCoord& coord, + index_t linear_offset, + const X& x, + bool_constant = {}, + bool_constant = {}) + { + buf_.template update_raw( + coord.get_offset(), + linear_offset, + coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), + x); + } + + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr void + update_vectorized_elements_raw(const TensorCoord& coord, + index_t linear_offset, + bool is_valid_element, + const X& x, + bool_constant = {}, + bool_constant = {}) + { + buf_.template update_raw( + coord.get_offset(), linear_offset, is_valid_element, x); + } + CK_TILE_HOST_DEVICE void print() const { printf("tensor_view{"); diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp index e410246983..caeb038521 100644 --- 
a/include/ck_tile/core/tensor/tile_window.hpp +++ b/include/ck_tile/core/tensor/tile_window.hpp @@ -292,12 +292,15 @@ struct tile_window_with_static_distribution { constexpr auto tile_dstr = TileDstr{}; auto dst_tensor = make_static_distributed_tensor(tile_dstr); - load(dst_tensor, bool_constant{}); + load(dst_tensor, number{}, bool_constant{}); return dst_tensor; } - template + template CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor, + number = {}, bool_constant = {}) const { using Traits = load_store_traits; @@ -785,6 +788,73 @@ struct tile_window_with_static_distribution }); } + template + CK_TILE_DEVICE void update_raw(const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}, + bool_constant = {}) const + { + using Traits = load_store_traits; + + using vector_t = typename Traits::vector_t; + using SFC_Ys = typename Traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + + // loop over thread tensor space [y0, y1, ...] + static_for<0, NumCoord, 1>{}([&](auto iCoord) { + /// TODO: use structure binding (to be captured later) if compiled in C++20 + auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0]; + auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1]; + + static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) { + constexpr auto iAccess = number{}; + + // data index [y0, y1, ...] + constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess); + + // read from distributed tensor + vector_t vec_value; + + static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == Traits::VectorDimY ? 
(idx_ys_start[jj] + j) + : idx_ys_start[jj]; + }, + number{}); + + constexpr index_t d = + tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys); + + vec_value.template get_as()(j) = + dstr_tensor.get_thread_buffer().template at(); + }); + + // write into bottom tensor + get_bottom_tensor_view().template update_vectorized_elements_raw( + bottom_tensor_thread_coord, + 0, + vec_value, + bool_constant{}, + bool_constant{}); + + // move thread coordinate + if constexpr(iCoordAccess != (NumAccessPerCoord - 1)) + { + constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); + + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + idx_diff_ys); + + move_window_adaptor_and_bottom_tensor_thread_coordinate( + window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); + } + }); + }); + } + // move thread's botom tensor coordiante // [x0', x1', ... ] ==> [offset] // also move window-origin diff --git a/include/ck_tile/core/tensor/tile_window_linear.hpp b/include/ck_tile/core/tensor/tile_window_linear.hpp index 4b921ec5b9..96a8352c04 100644 --- a/include/ck_tile/core/tensor/tile_window_linear.hpp +++ b/include/ck_tile/core/tensor/tile_window_linear.hpp @@ -432,23 +432,38 @@ struct tile_window_linear CK_TILE_DEVICE static constexpr index_t get_bottom_linear_offset(number) { constexpr auto linear_coord = get_bottom_linear_coordinate(number{}); - // since this is linear offset, we assum bottom X tensor is always linear - constexpr index_t linear_offset = [&]() { - constexpr auto x_idx_ = linear_coord; - constexpr auto x_len_ = TileDstr{}.get_lengths(); - static_assert(x_idx_.size() == x_len_.size()); - constexpr index_t x_dims_ = x_idx_.size(); - index_t cu_stride_ = 1; - index_t cu_offset_ = 0; - static_for<0, x_dims_, 1>{}([&](auto i_) { - auto r_i_ = number{}; - cu_offset_ += x_idx_[r_i_] * cu_stride_; - cu_stride_ *= x_len_[r_i_]; - }); - return cu_offset_; - }(); - - return linear_offset; 
+ constexpr auto is_pure_linear_tensor = + reduce_on_sequence(LinearBottomDims{}, multiplies{}, number<1>{}); + if constexpr(is_pure_linear_tensor) + { + // this case usually is a LDS window, everything is known at compile tile. + // we directly use BottomTensorView transform to compute the offset, in case padding + auto bottom_tensor_coord = + make_tensor_coordinate(BottomTensorView{}.get_tensor_descriptor(), linear_coord); + return bottom_tensor_coord.get_offset(); + } + else + { + // this case usually is a global window, where last dim can be linear + // we hack here, that use the original TileDstr to compute the linear offset + // ... hoping that there is no extra padding between other dims, which make sense + // since that would introduce runtime length (so can't use linear offset) + constexpr index_t linear_offset = [&]() { + constexpr auto x_idx_ = linear_coord; + constexpr auto x_len_ = TileDstr{}.get_lengths(); + static_assert(x_idx_.size() == x_len_.size()); + constexpr index_t x_dims_ = x_idx_.size(); + index_t cu_stride_ = 1; + index_t cu_offset_ = 0; + static_for<0, x_dims_, 1>{}([&](auto i_) { + auto r_i_ = number{}; + cu_offset_ += x_idx_[r_i_] * cu_stride_; + cu_stride_ *= x_len_[r_i_]; + }); + return cu_offset_; + }(); + return linear_offset; + } } CK_TILE_DEVICE constexpr auto get_num_of_access() const { return traits::NumAccess; } @@ -509,6 +524,64 @@ struct tile_window_linear return dst_tensor; } + template + CK_TILE_DEVICE auto load(DstTile& dst_tensor, + number = {}, + bool_constant = {}) const + { + using vector_t = typename traits::vector_t; + using SFC_Ys = typename traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + + // auto dst_tensor = make_static_distributed_tensor(tile_dstr); + + auto issue = [&](auto i_access_) { + constexpr auto IAccess = number{}; + + constexpr auto non_linear_id = number{}; + auto bottom_tensor_thread_coord = cached_coords_[non_linear_id]; + auto bottom_tensor_flag = cached_flags_[IAccess]; + + constexpr 
auto linear_offset = get_bottom_linear_offset(IAccess); + + // read from bottom tensor + const vector_t vec_value = + get_bottom_tensor_view().template get_vectorized_elements( + bottom_tensor_thread_coord, + linear_offset, + bottom_tensor_flag, + bool_constant{}); +#if 1 + // data index [y0, y1, ...] + constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess); + // write into distributed tensor + static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == traits::VectorDimY ? (idx_diff_ys[jj] + j) : idx_diff_ys[jj]; + }, + number{}); + + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys); + + dst_tensor.get_thread_buffer().template at() = + vec_value.template get_as()[j]; + }); +#else + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start); + static_assert(d % traits::ScalarPerVector == 0); + + dst_tensor.get_thread_buffer().template get_as()( + number{}) = bit_cast(vec_value); +#endif + }; + + WINDOW_DISPATCH_ISSUE(); + + return dst_tensor; + } + template + CK_TILE_DEVICE void update_raw(const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}, + bool_constant = {}) const + { + + using vector_t = typename traits::vector_t; + using SFC_Ys = typename traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + + // loop over thread tensor space [y0, y1, ...] + auto issue = [&](auto i_access_) { + constexpr auto IAccess = number{}; + constexpr auto non_linear_id = number{}; + auto bottom_tensor_thread_coord = cached_coords_[non_linear_id]; + constexpr auto linear_offset = get_bottom_linear_offset(IAccess); + auto bottom_tensor_flag = cached_flags_[IAccess]; + + // data index [y0, y1, ...] 
+ constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess); + + // read from distributed tensor + vector_t vec_value; + + static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj]; + }, + number{}); + + constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys); + + vec_value.template get_as()(j) = + dstr_tensor.get_thread_buffer().template at(); + }); + + // write into bottom tensor + get_bottom_tensor_view().template update_vectorized_elements_raw( + bottom_tensor_thread_coord, + linear_offset, + bottom_tensor_flag, + vec_value, + bool_constant{}, + bool_constant{}); + }; + + WINDOW_DISPATCH_ISSUE(); + } + // move thread's botom tensor coordiante // [x0', x1', ... ] ==> [offset] // also move window-origin diff --git a/include/ck_tile/core/tensor/tile_window_utils.hpp b/include/ck_tile/core/tensor/tile_window_utils.hpp new file mode 100644 index 0000000000..71a72329f8 --- /dev/null +++ b/include/ck_tile/core/tensor/tile_window_utils.hpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck_tile/core/arch/arch.hpp" +#include "ck_tile/core/arch/utility.hpp" +#include "ck_tile/core/algorithm/space_filling_curve.hpp" +#include "ck_tile/core/config.hpp" +#include "ck_tile/core/container/array.hpp" +#include "ck_tile/core/container/sequence.hpp" +#include "ck_tile/core/container/tuple.hpp" +#include "ck_tile/core/container/container_helper.hpp" +#include "ck_tile/core/tensor/static_distributed_tensor.hpp" +#include "ck_tile/core/tensor/tensor_adaptor.hpp" +#include "ck_tile/core/tensor/tile_distribution.hpp" +#include "ck_tile/core/utility/functional.hpp" +#include "ck_tile/core/utility/type_traits.hpp" + +#pragma once +namespace ck_tile { + +// input a lds store tile, extract some information from it +// used to set m0 value for gfx9 serious +template +CK_TILE_DEVICE auto get_async_store_smem_info(LdsTileWindow_&& lds_tile) +{ + using LdsTileWindow = remove_cvref_t; + using LdsDataType = typename LdsTileWindow::DataType; + + // issues * warps * lanes + static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded + + const index_t size_per_buf = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<0>{}, number<0>{})) * + sizeof(LdsDataType); + + const index_t size_per_wave = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<1>{}, number<0>{})) * + sizeof(LdsDataType) - + size_per_buf; + + const index_t size_per_issue = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<1>{}, number<0>{}, number<0>{})) * + sizeof(LdsDataType) - + size_per_buf; + + const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id(); + + return make_tuple(m0_init_value, size_per_issue); +} + +} // namespace ck_tile diff --git a/include/ck_tile/core/tensor/update_tile.hpp b/include/ck_tile/core/tensor/update_tile.hpp index fbce7c4083..570abde189 100644 --- 
a/include/ck_tile/core/tensor/update_tile.hpp +++ b/include/ck_tile/core/tensor/update_tile.hpp @@ -41,15 +41,65 @@ template + typename DataType_, + index_t i_access = -1, + bool oob_conditional_check = true> CK_TILE_DEVICE void update_tile(tile_window_with_static_distribution& tile_window, - const static_distributed_tensor& dstr_tensor) + const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}) { - tile_window.update(dstr_tensor); + tile_window.update(dstr_tensor, number{}, bool_constant{}); +} + +template +CK_TILE_DEVICE void +update_tile_raw(tile_window_with_static_distribution& tile_window, + const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}, + bool_constant = {}) +{ + tile_window.update_raw(dstr_tensor, + number{}, + bool_constant{}, + bool_constant{}); +} + +template +CK_TILE_DEVICE auto update_tile_raw( + tile_window_linear& + tile_window, + const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}, + bool_constant = {}) +{ + tile_window.update_raw(dstr_tensor, + number{}, + bool_constant{}, + bool_constant{}); } } // namespace ck_tile diff --git a/include/ck_tile/core/utility/static_counter.hpp b/include/ck_tile/core/utility/static_counter.hpp new file mode 100644 index 0000000000..84af3dd52f --- /dev/null +++ b/include/ck_tile/core/utility/static_counter.hpp @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/config.hpp" + +namespace ck_tile { + +template +struct static_counter +{ + public: + template + static constexpr index_t next() + { + return next(0) * Step + Start; + } + + template + static constexpr index_t next() + { + struct Unique + { + }; + return next(0) * Step + Start; + } + + template + static constexpr index_t current() + { + return current(0) * Step + Start; + } + + template + static constexpr index_t current() + { + struct Unique + { + }; + return current(0) * Step + Start; + } + + private: + template + struct slot + { + _Pragma("GCC diagnostic push"); + _Pragma("GCC diagnostic ignored \"-Wundefined-internal\""); + friend constexpr bool slot_allocated(slot); + _Pragma("GCC diagnostic pop"); + }; + + template + struct allocate_slot + { + friend constexpr bool slot_allocated(slot) { return true; } + enum + { + value = I + }; + }; + + // If slot_allocated(slot) has NOT been defined, then SFINAE will keep this function out of + // the overload set... + template ())> + static constexpr index_t next(index_t) + { + return next(0); + } + + // ...And this function will be used, instead, which will define slot_allocated(slot) via + // allocate_slot. + template + static constexpr index_t next(double) + { + return allocate_slot::value; + } + + // If slot_allocated(slot) has NOT been defined, then SFINAE will keep this function out of + // the overload set... + template ())> + static constexpr index_t current(index_t) + { + return current(0); + } + + // ...And this function will be used, instead, which will return the current counter, or assert + // in case next() hasn't been called yet. 
+ template + static constexpr index_t current(double) + { + static_assert(I != 0, "You must invoke next() first"); + + return I - 1; + } +}; + +namespace impl { +template +struct static_counter_uniq_; +} + +#define MAKE_SC() \ + ck_tile::static_counter> {} +#define MAKE_SC_WITH(start_, step_) \ + ck_tile::static_counter, start_, step_> {} +#define NEXT_SC(c_) c_.next<__COUNTER__>() +#define NEXT_SCI(c_, static_i_) c_.next<__COUNTER__ + static_i_>() + +// Usage: +// constexpr auto c = MAKE_SC() +// NEXT_SC(c) // -> constexpr 0 +// NEXT_SC(c) // -> constexpr 1 +// NEXT_SC(c) // -> constexpr 2 +} // namespace ck_tile diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp index 2e96009ace..2f3a302eea 100644 --- a/include/ck_tile/host.hpp +++ b/include/ck_tile/host.hpp @@ -11,6 +11,7 @@ #include "ck_tile/host/fill.hpp" #include "ck_tile/host/hip_check_error.hpp" #include "ck_tile/host/host_tensor.hpp" +#include "ck_tile/host/joinable_thread.hpp" #include "ck_tile/host/kernel_launch.hpp" #include "ck_tile/host/ranges.hpp" #include "ck_tile/host/reference/reference_batched_dropout.hpp" @@ -20,6 +21,7 @@ #include "ck_tile/host/reference/reference_batched_rotary_position_embedding.hpp" #include "ck_tile/host/reference/reference_batched_softmax.hpp" #include "ck_tile/host/reference/reference_elementwise.hpp" +#include "ck_tile/host/reference/reference_fused_moe.hpp" #include "ck_tile/host/reference/reference_gemm.hpp" #include "ck_tile/host/reference/reference_im2col.hpp" #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp" diff --git a/include/ck_tile/host/device_memory.hpp b/include/ck_tile/host/device_memory.hpp index 7c8549f74f..13684c0e24 100644 --- a/include/ck_tile/host/device_memory.hpp +++ b/include/ck_tile/host/device_memory.hpp @@ -7,6 +7,7 @@ #include #include #include "ck_tile/host/hip_check_error.hpp" +#include "ck_tile/host/host_tensor.hpp" namespace ck_tile { template @@ -36,6 +37,19 @@ struct DeviceMem mpDeviceBuf = nullptr; } } + 
template + DeviceMem(const HostTensor& t) : mMemSize(t.get_element_space_size_in_bytes()) + { + if(mMemSize != 0) + { + HIP_CHECK_ERROR(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); + } + else + { + mpDeviceBuf = nullptr; + } + ToDevice(t.data()); + } void Realloc(std::size_t mem_size) { if(mpDeviceBuf) @@ -92,6 +106,27 @@ struct DeviceMem HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost)); } } + + // construct a host tensor with type T + template + HostTensor ToHost(std::size_t cpySize) + { + // TODO: host tensor could be slightly larger than the device tensor + // we just copy all data from GPU buffer + std::size_t host_elements = (cpySize + sizeof(T) - 1) / sizeof(T); + HostTensor h_({host_elements}); + if(mpDeviceBuf) + { + HIP_CHECK_ERROR(hipMemcpy(h_.data(), mpDeviceBuf, cpySize, hipMemcpyDeviceToHost)); + } + return h_; + } + template + HostTensor ToHost() + { + return ToHost(mMemSize); + } + void SetZero() const { if(mpDeviceBuf) diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp index 335911860a..f24c338755 100644 --- a/include/ck_tile/host/fill.hpp +++ b/include/ck_tile/host/fill.hpp @@ -13,6 +13,7 @@ #include #include "ck_tile/core.hpp" +#include "ck_tile/host/joinable_thread.hpp" namespace ck_tile { @@ -22,13 +23,44 @@ struct FillUniformDistribution float a_{-5.f}; float b_{5.f}; std::optional seed_{11939}; + // ATTENTION: threaded does not guarantee the distribution between thread + bool threaded = false; template void operator()(ForwardIter first, ForwardIter last) const { - std::mt19937 gen(seed_.has_value() ? 
*seed_ : std::random_device{}()); - std::uniform_real_distribution dis(a_, b_); - std::generate(first, last, [&dis, &gen]() { return ck_tile::type_convert(dis(gen)); }); + if(threaded) + { + uint32_t num_thread = std::thread::hardware_concurrency(); + auto total = static_cast(std::distance(first, last)); + auto work_per_thread = static_cast((total + num_thread - 1) / num_thread); + + std::vector threads(num_thread); + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = std::min((it + 1) * work_per_thread, total); + auto thread_f = [this, total, iw_begin, iw_end, &first] { + if(iw_begin > total || iw_end > total) + return; + // need to make each thread unique, add an offset to current seed + std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin) + : std::random_device{}()); + std::uniform_real_distribution dis(a_, b_); + std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() { + return ck_tile::type_convert(dis(gen)); + }); + }; + threads[it] = joinable_thread(thread_f); + } + } + else + { + std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}()); + std::uniform_real_distribution dis(a_, b_); + std::generate( + first, last, [&dis, &gen]() { return ck_tile::type_convert(dis(gen)); }); + } } template @@ -115,13 +147,44 @@ struct FillNormalDistribution float mean_{0.f}; float variance_{1.f}; std::optional seed_{11939}; + // ATTENTION: threaded does not guarantee the distribution between thread + bool threaded = false; template void operator()(ForwardIter first, ForwardIter last) const { - std::mt19937 gen(seed_.has_value() ? 
*seed_ : std::random_device{}()); - std::normal_distribution dis(mean_, std::sqrt(variance_)); - std::generate(first, last, [&dis, &gen]() { return ck_tile::type_convert(dis(gen)); }); + if(threaded) + { + uint32_t num_thread = std::thread::hardware_concurrency(); + auto total = static_cast(std::distance(first, last)); + auto work_per_thread = static_cast((total + num_thread - 1) / num_thread); + + std::vector threads(num_thread); + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = std::min((it + 1) * work_per_thread, total); + auto thread_f = [this, total, iw_begin, iw_end, &first] { + if(iw_begin > total || iw_end > total) + return; + // need to make each thread unique, add an offset to current seed + std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin) + : std::random_device{}()); + std::normal_distribution dis(mean_, std::sqrt(variance_)); + std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() { + return ck_tile::type_convert(dis(gen)); + }); + }; + threads[it] = joinable_thread(thread_f); + } + } + else + { + std::mt19937 gen(seed_.has_value() ? 
*seed_ : std::random_device{}()); + std::normal_distribution dis(mean_, std::sqrt(variance_)); + std::generate( + first, last, [&dis, &gen]() { return ck_tile::type_convert(dis(gen)); }); + } } template @@ -235,6 +298,44 @@ struct FillMonotonicSeq } }; +template +struct FillStepRange +{ + float start_value_{0}; + float end_value_{3}; + float step_{1}; + + template + void operator()(ForwardIter first, ForwardIter last) const + { + std::generate(first, last, [=, n = start_value_]() mutable { + auto tmp = n; + n += step_; + if constexpr(IsAscending) + { + if(n > end_value_) + n = start_value_; + } + else + { + if(n < end_value_) + n = start_value_; + } + + return type_convert(tmp); + }); + } + + template + auto operator()(ForwardRange&& range) const -> std::void_t< + decltype(std::declval()(std::begin(std::forward(range)), + std::end(std::forward(range))))> + { + (*this)(std::begin(std::forward(range)), + std::end(std::forward(range))); + } +}; + template struct FillConstant { diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp index 5610ba324d..3902cad178 100644 --- a/include/ck_tile/host/host_tensor.hpp +++ b/include/ck_tile/host/host_tensor.hpp @@ -8,12 +8,13 @@ #include #include #include -#include #include #include #include +#include #include "ck_tile/core.hpp" +#include "ck_tile/host/joinable_thread.hpp" #include "ck_tile/host/ranges.hpp" namespace ck_tile { @@ -213,23 +214,6 @@ CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old return HostTensorDescriptor(new_lengths, new_strides); } -struct joinable_thread : std::thread -{ - template - joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) 
- { - } - - joinable_thread(joinable_thread&&) = default; - joinable_thread& operator=(joinable_thread&&) = default; - - ~joinable_thread() - { - if(this->joinable()) - this->join(); - } -}; - template struct ParallelTensorFunctor { @@ -590,6 +574,107 @@ struct HostTensor size() * FromSize / ToSize}; } + friend std::ostream& operator<<(std::ostream& os, const HostTensor& t) + { + os << t.mDesc; + os << "["; + for(typename Data::size_type idx = 0; idx < t.mData.size(); ++idx) + { + if(0 < idx) + { + os << ", "; + } + if constexpr(std::is_same_v || std::is_same_v) + { + os << type_convert(t.mData[idx]) << " #### "; + } + else + { + os << t.mData[idx]; + } + } + os << "]"; + return os; + } + + // read data from a file, as dtype + // the file could dumped from torch as (targeting tensor is t here) + // numpy.savetxt("f.txt", t.view(-1).numpy()) + // numpy.savetxt("f.txt", t.cpu().view(-1).numpy()) # from cuda to cpu to save + // numpy.savetxt("f.txt", t.cpu().view(-1).numpy(), fmt="%d") # save as int + // will output f.txt, each line is a value + // dtype=float or int, internally will cast to real type + void loadtxt(std::string file_name, std::string dtype = "float") + { + std::ifstream file(file_name); + + if(file.is_open()) + { + std::string line; + + index_t cnt = 0; + while(std::getline(file, line)) + { + if(cnt >= static_cast(mData.size())) + { + throw std::runtime_error(std::string("data read from file:") + file_name + + " is too big"); + } + + if(dtype == "float") + { + mData[cnt] = type_convert(std::stof(line)); + } + else if(dtype == "int" || dtype == "int32") + { + mData[cnt] = type_convert(std::stoi(line)); + } + cnt++; + } + file.close(); + if(cnt < static_cast(mData.size())) + { + std::cerr << "Warning! reading from file:" << file_name + << ", does not match the size of this tensor" << std::endl; + } + } + else + { + // Print an error message to the standard error + // stream if the file cannot be opened. 
+ throw std::runtime_error(std::string("unable to open file:") + file_name); + } + } + + // can save to a txt file and read from torch as: + // torch.from_numpy(np.loadtxt('f.txt', dtype=np.int32/np.float32...)).view([...]).contiguous() + void savetxt(std::string file_name, std::string dtype = "float") + { + std::ofstream file(file_name); + + if(file.is_open()) + { + for(auto& itm : mData) + { + if(dtype == "float") + file << type_convert(itm) << std::endl; + else if(dtype == "int") + file << type_convert(itm) << std::endl; + else + // TODO: we didn't implement operator<< for all custom + // data types, here fall back to float in case compile error + file << type_convert(itm) << std::endl; + } + file.close(); + } + else + { + // Print an error message to the standard error + // stream if the file cannot be opened. + throw std::runtime_error(std::string("unable to open file:") + file_name); + } + } + Descriptor mDesc; Data mData; }; diff --git a/include/ck_tile/host/joinable_thread.hpp b/include/ck_tile/host/joinable_thread.hpp new file mode 100644 index 0000000000..a822f967dc --- /dev/null +++ b/include/ck_tile/host/joinable_thread.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +namespace ck_tile { + +struct joinable_thread : std::thread +{ + template + joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) 
+ { + } + + joinable_thread(joinable_thread&&) = default; + joinable_thread& operator=(joinable_thread&&) = default; + + ~joinable_thread() + { + if(this->joinable()) + this->join(); + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/host/reference/reference_fused_moe.hpp b/include/ck_tile/host/reference/reference_fused_moe.hpp new file mode 100644 index 0000000000..bf89f92759 --- /dev/null +++ b/include/ck_tile/host/reference/reference_fused_moe.hpp @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" + +namespace ck_tile { +// [indexing implementation-1] +// using M_a as constexpr block_size to partition all tokens into different slices +// each slice map to one expert, and one expert can have multiple slices +// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5 +// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]] +// tok-0 tok-1 tok-2 tok-3 tok-4 +// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float +// number) +// +// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]] +// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5 +// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]] +// +// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1) +// max_num_tokens_padded : topk * input_tokens + num_experts * M_a - topk (updated) +// * this could be larger than actual, since actual tokens are on GPU +// +// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, +// 0, 1, 2, 5] +// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4 +// -|- exp-5 -| +// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, +// c, f, i, o] +// +// * length is max_num_tokens_padded, 
actual size is num_tokens_post_padded_ptr +// +// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5] +// * length is (max_num_tokens_padded + block_size - 1) / block_size +/// +// num_tokens_post_padded_ptr : [28] +// num_sorted_tiles_ptr : [7] + +template +void reference_fused_moe( + const ck_tile::HostTensor& a_host, // [tokens, hidden_size] + const ck_tile::HostTensor& g_host, // [experts, interme_size_0, hidden_size] + const ck_tile::HostTensor& d_host, // [experts, hidden_size, interme_size_1] + const ck_tile::HostTensor& sa_host, // [tokens, 1], + const ck_tile::HostTensor& sg_host, // [experts, 1, interme_size_0] + const ck_tile::HostTensor& sd_host, // [experts, 1, hidden_size], + const ck_tile::HostTensor& sy_host, // [experts, 1, interme_size_0] + ck_tile::HostTensor& o_host, // [tokens, hidden_size] + const ck_tile::HostTensor& sorted_token_ids_host, // [max_num_tokens_padded] + const ck_tile::HostTensor& sorted_weight_host, // [max_num_tokens_padded] + const ck_tile::HostTensor& + sorted_expert_ids_host, // [(max_num_tokens_padded + block_size - 1) / block_size] + const ck_tile::HostTensor& num_sorted_tiles_host, // [1] + + const ck_tile::HostTensor& + token_ids_host, // [tokens, topk] --> ugly!!! remove in the future + + ck_tile::index_t block_m, + ck_tile::index_t tokens, + ck_tile::index_t experts, + ck_tile::index_t hidden_size, + ck_tile::index_t intermediate_size, // this size is for gate/up + ck_tile::index_t topk, + ck_tile::index_t gate_only) +{ + assert(sorted_token_ids_host.get_num_of_dimension() == 1); + assert(sorted_weight_host.get_num_of_dimension() == 1); + assert(sorted_expert_ids_host.get_num_of_dimension() == 1); + assert(num_sorted_tiles_host.get_element_size() == 1); + ck_tile::index_t num_sorted_tiles = num_sorted_tiles_host.mData[0] / block_m; + ck_tile::index_t intermediate_size_0 = intermediate_size; + ck_tile::index_t intermediate_size_1 = intermediate_size / (gate_only ? 
1 : 2); + + // TODO: better remove this in the future, or modify the token_id value + auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) { + for(ck_tile::index_t i_ = 0; i_ < topk; i_++) + { + if(token_ids_host(token_id_, i_) == expert_id_) + return i_; + } + throw std::runtime_error("not correct token/expert pair\n"); + return -1; // TODO: not correct!! + }; + + ck_tile::HostTensor out_topk_tokens({tokens, topk, hidden_size}); + + int max_num_tokens_padded = topk * tokens + experts * block_m - topk; + // assert(); + auto f = [&](auto i_flatten) { + ck_tile::index_t i_tile = i_flatten / block_m; + if(i_tile >= num_sorted_tiles) + return; + ck_tile::index_t i_expert = sorted_expert_ids_host.mData[i_tile]; + ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten]; + if(i_token >= tokens) + return; + ck_tile::index_t i_topk = get_topk_id(i_token, i_expert); // TODO: ugly + auto weight = sorted_weight_host.mData[i_flatten]; + + ck_tile::HostTensor acc_0({1, intermediate_size_0}); + // first gemm + for(ck_tile::index_t i_n = 0; i_n < intermediate_size_0; i_n++) + { + AccDataType acc = static_cast(0); + for(ck_tile::index_t i_k = 0; i_k < hidden_size; i_k++) + { + acc += type_convert(a_host(i_token, i_k)) * + type_convert(g_host(i_expert, i_n, i_k)); + } + acc_0(0, i_n) = acc; + // printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc); + } + + ck_tile::HostTensor y({1, intermediate_size_1}); + if(gate_only) + { + if(intermediate_size_1 != intermediate_size_0) + throw std::runtime_error( + "intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) + + ", 1:" + std::to_string(intermediate_size_1)); + for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++) + { + Activation{}(y(0, i_n), acc_0(0, i_n)); + // printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n)); + } + } + else + { + if(intermediate_size_1 * 2 != intermediate_size_0) + throw std::runtime_error( + "intermediate_size 
not correct, 0:" + std::to_string(intermediate_size_0) + + ", 1:" + std::to_string(intermediate_size_1)); + for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++) + { + AccDataType tmp; + Activation{}(tmp, acc_0(0, i_n)); + y(0, i_n) = tmp * acc_0(0, i_n + intermediate_size_1); // TODO: elementwise mul + } + } + + // second gemm, loop along gemm-n + ck_tile::HostTensor acc_1({1, hidden_size}); + for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++) + { + AccDataType acc = static_cast(0); + for(ck_tile::index_t i_k = 0; i_k < intermediate_size_1; i_k++) + { + acc += y(0, i_k) * type_convert(d_host(i_expert, i_n, i_k)); + } + acc_1(0, i_n) = acc * weight; // multiple weight here + } + + for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++) + { + out_topk_tokens(i_token, i_topk, i_n) = acc_1(0, i_n); + } + }; + + // make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f, max_num_tokens_padded)(1); + + // reduce + auto r = [&](auto i_token) { + for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++) + { + AccDataType acc = type_convert(0); + for(ck_tile::index_t i_topk = 0; i_topk < topk; i_topk++) + { + acc += out_topk_tokens(i_token, i_topk, i_n); + } + o_host(i_token, i_n) = type_convert(acc); + } + }; + make_ParallelTensorFunctor(r, tokens)(std::thread::hardware_concurrency()); + + (void)num_sorted_tiles_host; + (void)sa_host; + (void)sg_host; + (void)sd_host; + (void)sy_host; +} +} // namespace ck_tile diff --git a/include/ck_tile/host/reference/reference_permute.hpp b/include/ck_tile/host/reference/reference_permute.hpp index 14ed4f815e..4e0f1a877e 100644 --- a/include/ck_tile/host/reference/reference_permute.hpp +++ b/include/ck_tile/host/reference/reference_permute.hpp @@ -16,7 +16,7 @@ namespace ck_tile { */ template CK_TILE_HOST void -reference_permute(const HostTensor& x, HostTensor& y, std::vector dims) +reference_permute(const HostTensor& x, HostTensor& y, std::vector perm) { 
const auto x_len = x.mDesc.get_lengths(); const auto y_len = y.mDesc.get_lengths(); @@ -43,7 +43,7 @@ reference_permute(const HostTensor& x, HostTensor& y, std::v std::vector tmp(rank, 0); for(index_t i = 0; i < rank; i++) { - tmp[dims[i]] = y_coord[i]; + tmp[perm[i]] = y_coord[i]; } return tmp; }(); @@ -54,4 +54,23 @@ reference_permute(const HostTensor& x, HostTensor& y, std::v make_ParallelTensorFunctor(f, x_elm)(std::thread::hardware_concurrency()); } + +template +CK_TILE_HOST auto reference_permute(const HostTensor& x, std::vector perm) +{ + auto x_shape = x.get_lengths(); + ck_tile::index_t rank = perm.size(); + std::vector y_shape = [&]() { + std::vector tmp(rank, 0); + for(int i = 0; i < static_cast(rank); i++) + { + tmp[i] = x_shape[perm[i]]; + } + return tmp; + }(); + + HostTensor y(y_shape); + reference_permute(x, y, perm); + return y; +} } // namespace ck_tile diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp index 01217e16ce..e24b1ba767 100644 --- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp +++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp @@ -572,6 +572,105 @@ struct FastGelu } }; +struct FastGeluAsm +{ + template + CK_TILE_HOST void operator()(Y& y, const X& x) const; + + template + CK_TILE_DEVICE void operator()(Y& y, const X& x) const; + + template <> + CK_TILE_HOST void operator()(float& y, const float& x) const + { + // const float u = -2.f * x * (0.035677f * x * x + 0.797885f); + const float c1 = -2.0 * 0.035677f; + const float c2 = -2.0 * 0.797885f; + const float u = x * (c1 * x * x + c2); + const float emu = exp(u); + y = x / (1.f + emu); + } + + // device code, use lower precision "__ocml_exp_f32" and "rcp" + template <> + CK_TILE_DEVICE void operator()(float& y, const float& x) const + { + const uint32_t c1 = 0xbd92220c; // -2.0 * 0.035677f; + const float c2 = -2.0 * 0.797885f; + const uint32_t log2e_ = 
0x3fb8aa3b; // log2e_v; + float tmp; + + asm volatile("v_mul_f32 %[v_tmp], %[v_x], %[v_x] ; x*x\n" + "v_fma_f32 %[v_tmp], %[v_tmp], %[s_c1], %[v_c2] ; c1*x*x+c2\n" + "v_mul_f32 %[v_tmp], %[v_tmp], %[v_x] ; x*(c1*x*x+c2)\n" + "v_mul_f32 %[v_tmp], %[v_tmp], %[s_log2e] ; log2e*x*(c1*x*x+c2)\n" + "v_exp_f32 %[v_tmp], %[v_tmp] ; emu = exp2(log2e*x*(c1*x*x+c2))\n" + "s_nop 0 ; hazard for exp\n" + "v_add_f32 %[v_tmp], %[v_tmp], 1.0 ; emu+1.0f\n" + "v_rcp_f32 %[v_tmp], %[v_tmp] ; 1/(emu+1.0f)\n" + "s_nop 0 ; hazard for rcp \n" + "v_mul_f32 %[v_y], %[v_tmp], %[v_x] ; x * 1/(emu+1f)\n" + : [v_y] "=v"(y), [v_tmp] "+v"(tmp) + : [v_x] "v"(x), [s_c1] "s"(c1), [v_c2] "v"(c2), [s_log2e] "s"(log2e_) + :); + } + + template <> + CK_TILE_HOST void operator()(fp32x2_t& y, const fp32x2_t& x) const + { + const float c1 = -2.0 * 0.035677f; + const float c2 = -2.0 * 0.797885f; + const float u0 = x.x * (c1 * x.x * x.x + c2); + const float emu0 = exp(u0); + y.x = x.x / (1.f + emu0); + const float u1 = x.y * (c1 * x.y * x.y + c2); + const float emu1 = exp(u1); + y.y = x.y / (1.f + emu1); + } + + // this is the packed version, to remove the data hazard for trans + template <> + CK_TILE_DEVICE void operator()(fp32x2_t& y, const fp32x2_t& x) const + { + const uint32_t c1 = 0xbd92220c; // -2.0 * 0.035677f; + float c2 = -2.0 * 0.797885f; + const uint32_t log2e_ = 0x3fb8aa3b; // log2e_v; + float tmp0, tmp1; + float y0 = x.x, y1 = x.y; + + asm volatile( + "v_mul_f32 %[v_tmp0], %[v_y0], %[v_y0] ; x*x\n" + "v_mul_f32 %[v_tmp1], %[v_y1], %[v_y1] ; x*x\n" + "v_fma_f32 %[v_tmp0], %[v_tmp0], %[s_c1], %[v_c2] ; c1*x*x+c2\n" + "v_fma_f32 %[v_tmp1], %[v_tmp1], %[s_c1], %[v_c2] ; c1*x*x+c2\n" + "v_mul_f32 %[v_tmp0], %[v_tmp0], %[v_y0] ; x*(c1*x*x+c2)\n" + "v_mul_f32 %[v_tmp1], %[v_tmp1], %[v_y1] ; x*(c1*x*x+c2)\n" + "v_mul_f32 %[v_tmp0], %[v_tmp0], %[s_log2e] ; log2e*x*(c1*x*x+c2)\n" + "v_mul_f32 %[v_tmp1], %[v_tmp1], %[s_log2e] ; log2e*x*(c1*x*x+c2)\n" + "v_exp_f32 %[v_tmp0], %[v_tmp0] ; emu = 
exp2(log2e*x*(c1*x*x+c2))\n" + "v_exp_f32 %[v_tmp1], %[v_tmp1] ; emu = exp2(log2e*x*(c1*x*x+c2))\n" + "v_add_f32 %[v_tmp0], %[v_tmp0], 1.0 ; emu+1.0f\n" + "v_add_f32 %[v_tmp1], %[v_tmp1], 1.0 ; emu+1.0f\n" + "v_rcp_f32 %[v_tmp0], %[v_tmp0] ; 1/(emu+1.0f)\n" + "v_rcp_f32 %[v_tmp1], %[v_tmp1] ; 1/(emu+1.0f)\n" + "v_mul_f32 %[v_y0], %[v_tmp0], %[v_y0] ; x * 1/(emu+1f)\n" + "v_mul_f32 %[v_y1], %[v_tmp1], %[v_y1] ; x * 1/(emu+1f)\n" + : [v_y0] "+v"(y0), + [v_y1] "+v"(y1), + [v_c2] "+v"(c2), + // NOTE! c2/y0/y1 are all local tmp variables, so it is entirely possible that they + // end up sharing the same register. We need to explicitly hint to the compiler that they + // may be read+written, so that it allocates different registers for them. The side + // effect is that c2=** may be issued for every such inline asm block + [v_tmp0] "+v"(tmp0), + [v_tmp1] "+v"(tmp1) + : [s_c1] "s"(c1), [s_log2e] "s"(log2e_) + :); + y.x = y0; + y.y = y1; + } +}; + // https://paperswithcode.com/method/gelu // y = 0.5*x*(1+erf(x/sqrt(2))) struct Gelu diff --git a/include/ck_tile/ops/flatmm.hpp b/include/ck_tile/ops/flatmm.hpp new file mode 100644 index 0000000000..eee80cda4a --- /dev/null +++ b/include/ck_tile/ops/flatmm.hpp @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp" +#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp" +#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp" +#include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" diff --git a/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp b/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp new file mode 100644 index 0000000000..f5c7caf7df --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp @@ -0,0 +1,615 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm.hpp" +#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp" + +namespace ck_tile { + +// A async load to LDS, B direct to AGPR +// B matrix preshuffled in br*kr*w +// require 4 wave, occupancy=1c +// agpr usage:256 +// vgpr usage:64(A local) + 64(acc) + 8(os_a) + 8(os_b) = 144 (rem:112) +// +// for this gemm, 4 16x16x16 transposed layout +// input A vgpr layout +// v0-v15: [ 0:15](gemm_m)x128(gemm_k) +// v16-v31: [16:31](gemm_m)x128(gemm_k) + +// input B vgpr layout +// v0-v15: [ 0: 15](gemm_n)x128(gemm_k) +// v16-v31: [ 64: 79](gemm_n)x128(gemm_k) +// ...................... +// v112-v127: [448:463](gemm_n)x128(gemm_k) + +// output C vgpr layout +// v0-v3 : [ 0:15](gemm_m)x[ 0: 15](gemm_n) +// v4-v7 : [16:31](gemm_m)x[ 0: 15](gemm_n) +// v8-v11: [ 0:15](gemm_m)x[64: 79](gemm_n) +// v12-v15: [16:31](gemm_m)x[64: 79](gemm_n) +// ...................... 
+// v56-v59: [ 0:15](gemm_m)x[448:463](gemm_n) +// v60-v63: [16:31](gemm_m)x[448:463](gemm_n) +struct Flatmm_32x512x128_1x4x1_16x16x32_Base // for f16/bf16 +{ + static constexpr index_t Block_M = 32; + static constexpr index_t Block_N = 512; + static constexpr index_t Block_K = 128; + + static constexpr index_t WarpPerBlock_M = 1; + static constexpr index_t WarpPerBlock_N = 4; + static constexpr index_t WarpPerBlock_K = 1; + + static constexpr index_t NumWarps = 4; + + static constexpr index_t Warp_M = 16; + static constexpr index_t Warp_N = 16; + static constexpr index_t Warp_K = 32; // 16 * SubKPacks + + static constexpr index_t BlockSize = 256; + + static constexpr index_t SubKPacks = 2; // this is used to guarantee every thread can do dwordx4 + + // TODO: note Nr/Kr/W need consider SubKPacks + static constexpr index_t Block_W = Warp_N * Warp_K; // 512 element + static constexpr index_t Block_Nr = Block_N / Warp_N; // 32 element, 8 per wave + static constexpr index_t Block_Kr = Block_K / Warp_K; // 4 + + static constexpr index_t Repeat_M = Block_M / (Warp_M * WarpPerBlock_M); // 2 + static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 8 + static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 8/2=4 + + static CK_TILE_DEVICE constexpr auto MakeCBlockDist() + { + constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding< + sequence<>, + tuple, sequence>, + tuple>, + tuple>, + sequence<2, 1>, // !! 
note here is different + sequence<0, 0>>{}; + + using WG = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + return c_block_dstr; + } + + static CK_TILE_DEVICE constexpr auto MakeCBlockTile() + { + using CDataType = float; + constexpr auto c_block_dstr = MakeCBlockDist(); + auto c_block_tensor = make_static_distributed_tensor(c_block_dstr); + return c_block_tensor; + } + + CK_TILE_HOST_DEVICE static constexpr auto MakeLdsStoreDesc_A() + { + // A async->LDS + // constexpr index_t Block_M = Problem::BlockShape::Block_M0; + // constexpr index_t Block_K = Problem::BlockShape::Block_K0; + // constexpr index_t BlockSize = Problem::BlockShape::BlockSize; + constexpr index_t warpSize = ck_tile::get_warp_size(); + // constexpr index_t NumWarps = Problem::BlockShape::NumWarps; + + constexpr index_t KPack_ = 8; // GetSmemKPack_A(); // LDS + constexpr index_t KVector = 2; // GetAlignment_A(); // async copy 1 dword + constexpr index_t KPad = KPack_; // pad between warps + + static_assert(Block_K % KVector == 0); + constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K + if constexpr(LanesPerK >= warpSize) + { + // need multiple waves to load K + static_assert(LanesPerK % warpSize == 0); + constexpr index_t wavesPerK = LanesPerK / warpSize; + if constexpr(wavesPerK > NumWarps) + { + // TODO: need multiple issues along K to load all data + } + else + { + constexpr index_t wavesPerM = NumWarps / wavesPerK; + constexpr index_t NumIssues = Block_M / wavesPerM; + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number{}), // k2 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + 
number<1>{}), // k2 + number{}, // lds store vector(actually no explicit store) + number<1>{}); + + constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple( + make_pass_through_transform(number{}), + make_merge_transform(make_tuple(number{}, number{})), + make_merge_transform(make_tuple(number{}, number{}))), + make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})); + + return lds_block_desc_issues_warps_lanes; + } + } + else + { + // lanes within a wave load different M but same K + static_assert(warpSize % LanesPerK == 0); + constexpr index_t LaneGroups = warpSize / LanesPerK; // along m + constexpr index_t NumIssues = Block_M / (LaneGroups * NumWarps); + + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + number{}, // k0 + number{}), // k1 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + number{}, // k0 + number<1>{}), // k1 + number{}, // lds store vector(actually no explicit store) + number<1>{}); + + constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple(make_pass_through_transform(number{}), + make_pass_through_transform(number{}), + make_merge_transform(make_tuple( + number{}, number{}, number{}))), + make_tuple(sequence<0>{}, sequence<2>{}, sequence<1, 3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})); + + return lds_block_desc_issues_warps_lanes; + } + } + + // template + CK_TILE_HOST_DEVICE static constexpr auto MakeLdsLoadDesc_A() + { + // load from LDS to register, every wave has same layout + constexpr index_t KPack_ = 8; // GetSmemKPack_A(); // LDS + constexpr index_t KPad = KPack_; // pad between warps + + constexpr index_t kAMLane = 16; + constexpr index_t kABKLane = 4; + constexpr index_t kABKPerLane = 4; + constexpr index_t kKIter = 2; + 
static_assert(KPack_ == (kABKPerLane * kKIter)); + + constexpr auto lds_block_desc_0 = + make_naive_tensor_descriptor(make_tuple(number{}, // m0 y + number{}, // m1 p + number{}, // k0 y + number{}, // k1 p + number{}), // k2 y-vector + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number<1>{}), // k2 + number{}, // lds load vector + number<1>{}); + + constexpr auto lds_desc_m_k = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple(make_merge_transform(make_tuple(number{}, number{})), + make_merge_transform( + make_tuple(number{}, number{}, number{}))), + make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return lds_desc_m_k; + } + + static constexpr auto GetGemm_AWarpEnc() + { + constexpr index_t kAMLane = 16; + constexpr index_t kABKLane = 4; + constexpr index_t kABKPerLane = 4; + constexpr index_t kKIter = 2; + + using enc_ = tile_distribution_encoding< + sequence<>, + tuple, sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>; + return enc_{}; + } + + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + return 32 * (128 + 8) * sizeof(bf16_t); + } +}; + +struct Flatmm_32x512x128_1x4x1_16x16x32_BF16 : public Flatmm_32x512x128_1x4x1_16x16x32_Base +{ + using ADataType = bf16_t; + using BDataType = bf16_t; + + // TODO: need paired with tile_window_linear! + // TODO: need call init_raw() before call this function! 
+ template + CK_TILE_DEVICE auto + operator()(const ARes& res_a, + const ACoords& cached_coords_a, + const BRes& res_b, + const BCoords& cached_coords_b, + CK_TILE_LDS_ADDR void* smem, + index_t k, + index_t tile_offset_a, // for each tile, the offset to move for each unroll + index_t tile_offset_b) // for each tile, the offset to move for each unroll + { + static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 2 /*2x per dword*/); // 8 + static_assert(BCoords::size() == Repeat_N); + + auto a_sst = make_tile_window( + make_tensor_view( + reinterpret_cast(smem), MakeLdsStoreDesc_A()), + MakeLdsStoreDesc_A().get_lengths(), + {0, 0, 0}); + + auto a_sld = [&]() { + constexpr auto a_warp_enc_ = GetGemm_AWarpEnc(); + constexpr auto a_outer_dstr_enc = tile_distribution_encoding< + sequence, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + constexpr auto a_block_dstr_encode = + detail::make_embed_tile_distribution_encoding(a_outer_dstr_enc, a_warp_enc_); + return make_tile_window_linear( + make_tensor_view( + reinterpret_cast(smem), MakeLdsLoadDesc_A()), + MakeLdsLoadDesc_A().get_lengths(), + {0, 0}, + make_static_tile_distribution(a_block_dstr_encode)); + }(); + + const index_t tile_offset_a_bytes = tile_offset_a * sizeof(ADataType); + const index_t tile_offset_b_bytes = tile_offset_b * sizeof(BDataType); + + const auto [m0_init_value, size_per_issue] = get_async_store_smem_info(a_sst); + constexpr auto smem_buf_size = + MakeLdsLoadDesc_A().get_element_space_size() * sizeof(ADataType); + static_assert(a_sld.get_num_of_access() == 8); + constexpr auto sld_os = generate_tuple( + [&](auto i_access) { + return number{}; + }, + number{}); + + index_t loop_cnt = k / Block_K; + + // this is the acc thread buffer + fp32x4_t v_acc[16]{.0f}; + + // B nr->kr +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winline-asm" + // clang-format off + asm volatile( +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16 
+#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc" +#undef CK_TILE_FLATMM_UK_MFMA + : [s_loop_cnt]"+s"(loop_cnt), + [v_acc_0]"+v"(v_acc[0]), + [v_acc_1]"+v"(v_acc[1]), + [v_acc_2]"+v"(v_acc[2]), + [v_acc_3]"+v"(v_acc[3]), + [v_acc_4]"+v"(v_acc[4]), + [v_acc_5]"+v"(v_acc[5]), + [v_acc_6]"+v"(v_acc[6]), + [v_acc_7]"+v"(v_acc[7]), + [v_acc_8]"+v"(v_acc[8]), + [v_acc_9]"+v"(v_acc[9]), + [v_acc_10]"+v"(v_acc[10]), + [v_acc_11]"+v"(v_acc[11]), + [v_acc_12]"+v"(v_acc[12]), + [v_acc_13]"+v"(v_acc[13]), + [v_acc_14]"+v"(v_acc[14]), + [v_acc_15]"+v"(v_acc[15]), + [s_mem_]"+r"(smem) + : [s_res_a0]"s"(res_a[0]), + [s_res_a1]"s"(res_a[1]), + [s_res_a2]"s"(res_a[2]), + [s_res_a3]"s"(res_a[3]), + [s_res_b0]"s"(res_b[0]), + [s_res_b1]"s"(res_b[1]), + [s_res_b2]"s"(res_b[2]), + [s_res_b3]"s"(res_b[3]), + [v_os_a0]"v"(static_cast(cached_coords_a[number<0>{}] * sizeof(ADataType))), + [v_os_a1]"v"(static_cast(cached_coords_a[number<1>{}] * sizeof(ADataType))), + [v_os_a2]"v"(static_cast(cached_coords_a[number<2>{}] * sizeof(ADataType))), + [v_os_a3]"v"(static_cast(cached_coords_a[number<3>{}] * sizeof(ADataType))), + [v_os_a4]"v"(static_cast(cached_coords_a[number<4>{}] * sizeof(ADataType))), + [v_os_a5]"v"(static_cast(cached_coords_a[number<5>{}] * sizeof(ADataType))), + [v_os_a6]"v"(static_cast(cached_coords_a[number<6>{}] * sizeof(ADataType))), + [v_os_a7]"v"(static_cast(cached_coords_a[number<7>{}] * sizeof(ADataType))), + + [v_os_b0]"v"(static_cast(cached_coords_b[number<0>{}] * sizeof(BDataType))), + [v_os_b1]"v"(static_cast(cached_coords_b[number<1>{}] * sizeof(BDataType))), + [v_os_b2]"v"(static_cast(cached_coords_b[number<2>{}] * sizeof(BDataType))), + [v_os_b3]"v"(static_cast(cached_coords_b[number<3>{}] * sizeof(BDataType))), + [v_os_b4]"v"(static_cast(cached_coords_b[number<4>{}] * sizeof(BDataType))), + [v_os_b5]"v"(static_cast(cached_coords_b[number<5>{}] * sizeof(BDataType))), + [v_os_b6]"v"(static_cast(cached_coords_b[number<6>{}] * sizeof(BDataType))), + 
[v_os_b7]"v"(static_cast(cached_coords_b[number<7>{}] * sizeof(BDataType))), + + [v_os_slda]"v"(static_cast(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))), + [s_m0_init]"s"(m0_init_value), + [s_size_per_issue]"s"(size_per_issue), + [smem_sz]"n"(smem_buf_size), //(smem_buf_size), + [sld_os_0]"n"(sld_os[number<0>{}].value), + [sld_os_1]"n"(sld_os[number<1>{}].value), + [sld_os_2]"n"(sld_os[number<2>{}].value), + [sld_os_3]"n"(sld_os[number<3>{}].value), + [sld_os_4]"n"(sld_os[number<4>{}].value), + [sld_os_5]"n"(sld_os[number<5>{}].value), + [sld_os_6]"n"(sld_os[number<6>{}].value), + [sld_os_7]"n"(sld_os[number<7>{}].value), + [s_tile_os_a]"s"(tile_offset_a_bytes), + [s_tile_os_b]"s"(tile_offset_b_bytes) + : "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", + "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", + "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", + "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", + "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", + "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", + "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", + "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", + "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", + "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", + "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", + "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", + "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123", + "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131", + "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139", + "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147", + "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155", + "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", + "a164", 
"a165", "a166", "a167", "a168", "a169", "a170", "a171", + "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", + "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", + "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195", + "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203", + "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211", + "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219", + "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227", + "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", + "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", + "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", + "a252", "a253", "a254", "a255", + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", + "s86", // s86 as tmp + "v64", "v65", "v66", "v67", "v68", "v69", + "v70", "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", + "v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89", + "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99", + "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107", + "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", + "v116", "v117", "v118", "v119", "v120", "v121", "v122", "v123", + "v124", "v125", "v126", "v127" + ); + // clang-format on +#pragma clang diagnostic pop + + // return local scratch + auto c = MakeCBlockTile(); + for(auto i = 0; i < 16; i++) + { + c.get_thread_buffer()[4 * i + 0] = v_acc[i].x; + c.get_thread_buffer()[4 * i + 1] = v_acc[i].y; + c.get_thread_buffer()[4 * i + 2] = v_acc[i].z; + c.get_thread_buffer()[4 * i + 3] = v_acc[i].w; + } + return c; + } +}; + +struct Flatmm_32x512x128_1x4x1_16x16x32_FP16 : public Flatmm_32x512x128_1x4x1_16x16x32_Base +{ + using ADataType = fp16_t; + using BDataType = fp16_t; + + // TODO: need paired with tile_window_linear! + // TODO: need call init_raw() before call this function! 
+ template + CK_TILE_DEVICE auto + operator()(const ARes& res_a, + const ACoords& cached_coords_a, + const BRes& res_b, + const BCoords& cached_coords_b, + CK_TILE_LDS_ADDR void* smem, + index_t k, + index_t tile_offset_a, // for each tile, the offset to move for each unroll + index_t tile_offset_b) // for each tile, the offset to move for each unroll + { + static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 2 /*2x per dword*/); // 8 + static_assert(BCoords::size() == Repeat_N); + + auto a_sst = make_tile_window( + make_tensor_view( + reinterpret_cast(smem), MakeLdsStoreDesc_A()), + MakeLdsStoreDesc_A().get_lengths(), + {0, 0, 0}); + + auto a_sld = [&]() { + constexpr auto a_warp_enc_ = GetGemm_AWarpEnc(); + constexpr auto a_outer_dstr_enc = tile_distribution_encoding< + sequence, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + constexpr auto a_block_dstr_encode = + detail::make_embed_tile_distribution_encoding(a_outer_dstr_enc, a_warp_enc_); + return make_tile_window_linear( + make_tensor_view( + reinterpret_cast(smem), MakeLdsLoadDesc_A()), + MakeLdsLoadDesc_A().get_lengths(), + {0, 0}, + make_static_tile_distribution(a_block_dstr_encode)); + }(); + + const index_t tile_offset_a_bytes = tile_offset_a * sizeof(ADataType); + const index_t tile_offset_b_bytes = tile_offset_b * sizeof(BDataType); + + const auto [m0_init_value, size_per_issue] = get_async_store_smem_info(a_sst); + constexpr auto smem_buf_size = + MakeLdsLoadDesc_A().get_element_space_size() * sizeof(ADataType); + static_assert(a_sld.get_num_of_access() == 8); + constexpr auto sld_os = generate_tuple( + [&](auto i_access) { + return number{}; + }, + number{}); + + index_t loop_cnt = k / Block_K; + + // this is the acc thread buffer + fp32x4_t v_acc[16]{.0f}; + + // B nr->kr +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winline-asm" + // clang-format off + asm volatile( +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16 
+#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc" +#undef CK_TILE_FLATMM_UK_MFMA + : [s_loop_cnt]"+s"(loop_cnt), + [v_acc_0]"+v"(v_acc[0]), + [v_acc_1]"+v"(v_acc[1]), + [v_acc_2]"+v"(v_acc[2]), + [v_acc_3]"+v"(v_acc[3]), + [v_acc_4]"+v"(v_acc[4]), + [v_acc_5]"+v"(v_acc[5]), + [v_acc_6]"+v"(v_acc[6]), + [v_acc_7]"+v"(v_acc[7]), + [v_acc_8]"+v"(v_acc[8]), + [v_acc_9]"+v"(v_acc[9]), + [v_acc_10]"+v"(v_acc[10]), + [v_acc_11]"+v"(v_acc[11]), + [v_acc_12]"+v"(v_acc[12]), + [v_acc_13]"+v"(v_acc[13]), + [v_acc_14]"+v"(v_acc[14]), + [v_acc_15]"+v"(v_acc[15]), + [s_mem_]"+r"(smem) + : [s_res_a0]"s"(res_a[0]), + [s_res_a1]"s"(res_a[1]), + [s_res_a2]"s"(res_a[2]), + [s_res_a3]"s"(res_a[3]), + [s_res_b0]"s"(res_b[0]), + [s_res_b1]"s"(res_b[1]), + [s_res_b2]"s"(res_b[2]), + [s_res_b3]"s"(res_b[3]), + [v_os_a0]"v"(static_cast(cached_coords_a[number<0>{}] * sizeof(ADataType))), + [v_os_a1]"v"(static_cast(cached_coords_a[number<1>{}] * sizeof(ADataType))), + [v_os_a2]"v"(static_cast(cached_coords_a[number<2>{}] * sizeof(ADataType))), + [v_os_a3]"v"(static_cast(cached_coords_a[number<3>{}] * sizeof(ADataType))), + [v_os_a4]"v"(static_cast(cached_coords_a[number<4>{}] * sizeof(ADataType))), + [v_os_a5]"v"(static_cast(cached_coords_a[number<5>{}] * sizeof(ADataType))), + [v_os_a6]"v"(static_cast(cached_coords_a[number<6>{}] * sizeof(ADataType))), + [v_os_a7]"v"(static_cast(cached_coords_a[number<7>{}] * sizeof(ADataType))), + + [v_os_b0]"v"(static_cast(cached_coords_b[number<0>{}] * sizeof(BDataType))), + [v_os_b1]"v"(static_cast(cached_coords_b[number<1>{}] * sizeof(BDataType))), + [v_os_b2]"v"(static_cast(cached_coords_b[number<2>{}] * sizeof(BDataType))), + [v_os_b3]"v"(static_cast(cached_coords_b[number<3>{}] * sizeof(BDataType))), + [v_os_b4]"v"(static_cast(cached_coords_b[number<4>{}] * sizeof(BDataType))), + [v_os_b5]"v"(static_cast(cached_coords_b[number<5>{}] * sizeof(BDataType))), + [v_os_b6]"v"(static_cast(cached_coords_b[number<6>{}] * sizeof(BDataType))), + 
[v_os_b7]"v"(static_cast(cached_coords_b[number<7>{}] * sizeof(BDataType))), + + [v_os_slda]"v"(static_cast(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))), + [s_m0_init]"s"(m0_init_value), + [s_size_per_issue]"s"(size_per_issue), + [smem_sz]"n"(smem_buf_size), //(smem_buf_size), + [sld_os_0]"n"(sld_os[number<0>{}].value), + [sld_os_1]"n"(sld_os[number<1>{}].value), + [sld_os_2]"n"(sld_os[number<2>{}].value), + [sld_os_3]"n"(sld_os[number<3>{}].value), + [sld_os_4]"n"(sld_os[number<4>{}].value), + [sld_os_5]"n"(sld_os[number<5>{}].value), + [sld_os_6]"n"(sld_os[number<6>{}].value), + [sld_os_7]"n"(sld_os[number<7>{}].value), + [s_tile_os_a]"s"(tile_offset_a_bytes), + [s_tile_os_b]"s"(tile_offset_b_bytes) + : "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", + "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", + "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", + "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", + "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", + "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", + "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", + "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", + "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", + "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", + "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", + "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", + "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123", + "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131", + "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139", + "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147", + "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155", + "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", + "a164", 
"a165", "a166", "a167", "a168", "a169", "a170", "a171", + "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", + "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", + "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195", + "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203", + "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211", + "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219", + "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227", + "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", + "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", + "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", + "a252", "a253", "a254", "a255", + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", + "s86", // s86 as tmp + "v64", "v65", "v66", "v67", "v68", "v69", + "v70", "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", + "v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89", + "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99", + "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107", + "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", + "v116", "v117", "v118", "v119", "v120", "v121", "v122", "v123", + "v124", "v125", "v126", "v127" + ); + // clang-format on +#pragma clang diagnostic pop + + // return local scratch + auto c = MakeCBlockTile(); + for(auto i = 0; i < 16; i++) + { + c.get_thread_buffer()[4 * i + 0] = v_acc[i].x; + c.get_thread_buffer()[4 * i + 1] = v_acc[i].y; + c.get_thread_buffer()[4 * i + 2] = v_acc[i].z; + c.get_thread_buffer()[4 * i + 3] = v_acc[i].w; + } + return c; + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp new file mode 100644 index 0000000000..203c87b9c6 --- /dev/null +++ 
b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm.hpp" +#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp" + +namespace ck_tile { + +// "S"tream update output along "N" +// A in smem, B load from global +// require 4 wave, occupancy=1c +struct FlatmmSn_32x128x512_1x4x1_16x16x32_Base +{ + static constexpr index_t Block_M = 32; + static constexpr index_t Block_N = 128; + static constexpr index_t Block_K = 512; + + static constexpr index_t WarpPerBlock_M = 1; + static constexpr index_t WarpPerBlock_N = 4; + static constexpr index_t WarpPerBlock_K = 1; + + static constexpr index_t Warp_M = 16; + static constexpr index_t Warp_N = 16; + static constexpr index_t Warp_K = 32; + + static constexpr index_t BlockSize = 256; + + // static constexpr index_t KPack = 2; // this is used to guarantee every thread can do dwordx4 + + // TODO: note Nr/Kr/W need consider KPack + static constexpr index_t Block_W = Warp_N * Warp_K; // 512 element + static constexpr index_t Block_Nr = Block_N / Warp_N; // 8 element, 2 per wave + static constexpr index_t Block_Kr = Block_K / Warp_K; // 16 + + static constexpr index_t Repeat_M = Block_M / (Warp_M * WarpPerBlock_M); // 2 + static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 2 + static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 16 + + static CK_TILE_DEVICE constexpr auto MakeCBlockDist() + { + constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding< + sequence<>, + tuple, sequence>, + tuple>, + tuple>, + sequence<2, 1>, // !! 
note here is different + sequence<0, 0>>{}; + + using WG = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + return c_block_dstr; + } + + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + // y y p p p y + // reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4) + // but order is N0*M0*Nv + // in LDS we need store as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4) + // y y wave-id lid/16 lid%16 v + return 2 * 2 * 4 * 4 * (16 * 4 + 4) * sizeof(bf16_t); + } +}; + +struct FlatmmSn_32x128x512_1x4x1_16x16x32_BF16 : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base +{ + using BDataType = bf16_t; + using ODataType = bf16_t; + + // TODO: need paired with tile_window_linear! + // TODO: need call init_raw() before call this function! 
+ // template + template + CK_TILE_DEVICE auto + operator()(const BRes& res_b, + const BCoords& cached_coords_b, + const ORes& res_o, + const OCoords& cached_coords_o, + const OFlags& o_flags, // this should be in sgpr + CK_TILE_LDS_ADDR void* smem, + index_t n, // loop along n dim + const ScaleTensor& scale_, + index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust + index_t tile_offset_o) + { + static_assert(BCoords::size() == 8); // 8 + static_assert(OCoords::size() == 8); + + const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType); + const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType); + + static_assert(ScaleTensor::size() == 2); + float s0 = scale_[number<0>{}]; + float s1 = scale_[number<1>{}]; + + index_t loop_cnt = n / Block_N; // number of Block_N-wide tiles along n; bound to %[s_loop_cnt] + + register float v_c0 asm("v64"); // v_c0..v_c31 are pinned to v64..v95; the same registers are named in the asm clobber list + register float v_c1 asm("v65"); + register float v_c2 asm("v66"); + register float v_c3 asm("v67"); + register float v_c4 asm("v68"); + register float v_c5 asm("v69"); + register float v_c6 asm("v70"); + register float v_c7 asm("v71"); + register float v_c8 asm("v72"); + register float v_c9 asm("v73"); + register float v_c10 asm("v74"); + register float v_c11 asm("v75"); + register float v_c12 asm("v76"); + register float v_c13 asm("v77"); + register float v_c14 asm("v78"); + register float v_c15 asm("v79"); + register float v_c16 asm("v80"); + register float v_c17 asm("v81"); + register float v_c18 asm("v82"); + register float v_c19 asm("v83"); + register float v_c20 asm("v84"); + register float v_c21 asm("v85"); + register float v_c22 asm("v86"); + register float v_c23 asm("v87"); + register float v_c24 asm("v88"); + register float v_c25 asm("v89"); + register float v_c26 asm("v90"); + register float v_c27 asm("v91"); + register float v_c28 asm("v92"); + register float v_c29 asm("v93"); + register float v_c30 asm("v94"); + register float v_c31 asm("v95"); + int32_t nan_hi = 0x7fff0000; // %[v_nan_hi]: picked by the bf16 _UK_PK_CVT_ when the input compares unordered (NaN) + int32_t nan_lo = 0x00007fff; // %[v_nan_lo]: added (together with 1) to the f32 bits by the bf16 _UK_PK_CVT_ + + // in smem, 
the layout is M0(2)*K0(128)*M1(16)*K1(4) + // every thread needs 8xK in contiguous registers + // ... and every wave needs the same data + int lane_id = threadIdx.x % 64; + int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128; + sld_y_os *= 2; // presumably element offset -> byte offset for 2-byte data — TODO confirm + + // y y p p p y + // reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4) + // but order is N0*M0*Nv + // in LDS we need to store as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4) + // y y wave-id lid/16 lid%16 v + // sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4 + int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4); + sfl_sst *= 2; + + // from LDS we need to load as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4) + // ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1issue(pk2) + // sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4 + int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4; + sfl_sld *= 2; + + // B nr->kr + // clang-format off +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winline-asm" + asm volatile( // the .inc included below supplies the asm template; CK_TILE_FLATMM_UK_MFMA selects its bf16 macros +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16 +#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc" +#undef CK_TILE_FLATMM_UK_MFMA + :[smem_]"+r"(smem), + [s_loop_cnt]"+s"(loop_cnt), + [c0]"+v" (v_c0), + [c1]"+v" (v_c1), + [c2]"+v" (v_c2), + [c3]"+v" (v_c3), + [c4]"+v" (v_c4), + [c5]"+v" (v_c5), + [c6]"+v" (v_c6), + [c7]"+v" (v_c7), + [c8]"+v" (v_c8), + [c9]"+v" (v_c9), + [c10]"+v"(v_c10), + [c11]"+v"(v_c11), + [c12]"+v"(v_c12), + [c13]"+v"(v_c13), + [c14]"+v"(v_c14), + [c15]"+v"(v_c15), + [c16]"+v"(v_c16), + [c17]"+v"(v_c17), + [c18]"+v"(v_c18), + [c19]"+v"(v_c19), + [c20]"+v"(v_c20), + [c21]"+v"(v_c21), + [c22]"+v"(v_c22), + [c23]"+v"(v_c23), + [c24]"+v"(v_c24), + [c25]"+v"(v_c25), + [c26]"+v"(v_c26), + [c27]"+v"(v_c27), + [c28]"+v"(v_c28), + [c29]"+v"(v_c29), + [c30]"+v"(v_c30), + [c31]"+v"(v_c31) + : + [sld_a_base]"n"(0), + [shfl_base]"n"(0), + [v_sld_y_os]"v"(sld_y_os), + [v_sfl_sld]"v"(sfl_sld), + 
[v_sfl_sst]"v"(sfl_sst), + [s_res_o0]"s"(res_o[0]), + [s_res_o1]"s"(res_o[1]), + //[s_res_o2]"s"(res_o[2]), + //[s_res_o3]"s"(res_o[3]), + [s_res_b0]"s"(res_b[0]), + [s_res_b1]"s"(res_b[1]), + [s_res_b2]"s"(res_b[2]), + [s_res_b3]"s"(res_b[3]), + [v_os_o0]"v"(static_cast(cached_coords_o[number<0>{}] * sizeof(ODataType))), + [v_os_o1]"v"(static_cast(cached_coords_o[number<1>{}] * sizeof(ODataType))), + [v_os_o2]"v"(static_cast(cached_coords_o[number<2>{}] * sizeof(ODataType))), + [v_os_o3]"v"(static_cast(cached_coords_o[number<3>{}] * sizeof(ODataType))), + [v_os_o4]"v"(static_cast(cached_coords_o[number<4>{}] * sizeof(ODataType))), + [v_os_o5]"v"(static_cast(cached_coords_o[number<5>{}] * sizeof(ODataType))), + [v_os_o6]"v"(static_cast(cached_coords_o[number<6>{}] * sizeof(ODataType))), + [v_os_o7]"v"(static_cast(cached_coords_o[number<7>{}] * sizeof(ODataType))), + [v_os_b0]"v"(static_cast(cached_coords_b[number<0>{}] * sizeof(BDataType))), + [v_os_b1]"v"(static_cast(cached_coords_b[number<1>{}] * sizeof(BDataType))), + [v_os_b2]"v"(static_cast(cached_coords_b[number<2>{}] * sizeof(BDataType))), + [v_os_b3]"v"(static_cast(cached_coords_b[number<3>{}] * sizeof(BDataType))), + [v_os_b4]"v"(static_cast(cached_coords_b[number<4>{}] * sizeof(BDataType))), + [v_os_b5]"v"(static_cast(cached_coords_b[number<5>{}] * sizeof(BDataType))), + [v_os_b6]"v"(static_cast(cached_coords_b[number<6>{}] * sizeof(BDataType))), + [v_os_b7]"v"(static_cast(cached_coords_b[number<7>{}] * sizeof(BDataType))), + + [s_tile_os_o]"s"(tile_stride_o_bytes), + [s_tile_os_b]"s"(tile_stride_b_bytes), + [scale_0]"v"(s0), + [scale_1]"v"(s1), + [v_nan_lo]"v"(nan_lo), + [v_nan_hi]"v"(nan_hi), + [s_execflag_0]"s"(o_flags[number<0>{}]), + [s_execflag_1]"s"(o_flags[number<1>{}]), + [s_execflag_2]"s"(o_flags[number<2>{}]), + [s_execflag_3]"s"(o_flags[number<3>{}]), + [s_execflag_4]"s"(o_flags[number<4>{}]), + [s_execflag_5]"s"(o_flags[number<5>{}]), + [s_execflag_6]"s"(o_flags[number<6>{}]), + 
[s_execflag_7]"s"(o_flags[number<7>{}]) + : + "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", + "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", + "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", + "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", + "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", + "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", + "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", + "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", + "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", + "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", + "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", + "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", + "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123", + "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131", + "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139", + "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147", + "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155", + "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", + "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171", + "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", + "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", + "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195", + "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203", + "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211", + "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219", + "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227", + "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", + "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", + "a244", "a245", "a246", "a247", "a248", 
"a249", "a250", "a251", + "a252", "a253", "a254", "a255", + "s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86", + "s36", "s37", + "v50", "v54", "v55", + "v64","v65","v66","v67","v68","v69","v70","v71", + "v72","v73","v74","v75","v76","v77","v78","v79", + "v80","v81","v82","v83","v84","v85","v86","v87", + "v88","v89","v90","v91","v92","v93","v94","v95", + "v128", "v129", "v130", "v131", + "v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139", + "v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147", + "v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155", + "v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163", + "v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171", + "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", + "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", + "v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195", + "v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203", + "v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211", + "v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219", + "v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227", + "v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235", + "v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243", + "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", + "v252", "v253", "v254", "v255" + ); +#pragma clang diagnostic pop + // clang-format on + } +}; + +struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base +{ + using BDataType = bf16_t; // NOTE(review): bf16_t inside the FP16 variant; operator() only uses sizeof(BDataType)/sizeof(ODataType) — fp16_t presumably intended, confirm + using ODataType = bf16_t; + + // TODO: needs to be paired with tile_window_linear! + // TODO: need to call init_raw() before calling this function! 
+ // template + template + CK_TILE_DEVICE auto + operator()(const BRes& res_b, + const BCoords& cached_coords_b, + const ORes& res_o, + const OCoords& cached_coords_o, + const OFlags& o_flags, // this should be in sgpr + CK_TILE_LDS_ADDR void* smem, + index_t n, // loop along n dim + const ScaleTensor& scale_, + index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust + index_t tile_offset_o) + { + static_assert(BCoords::size() == 8); // 8 + static_assert(OCoords::size() == 8); + + const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType); + const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType); + + static_assert(ScaleTensor::size() == 2); + float s0 = scale_[number<0>{}]; + float s1 = scale_[number<1>{}]; + + index_t loop_cnt = n / Block_N; // number of Block_N-wide tiles along n; bound to %[s_loop_cnt] + + register float v_c0 asm("v64"); // v_c0..v_c31 are pinned to v64..v95; the same registers are named in the asm clobber list + register float v_c1 asm("v65"); + register float v_c2 asm("v66"); + register float v_c3 asm("v67"); + register float v_c4 asm("v68"); + register float v_c5 asm("v69"); + register float v_c6 asm("v70"); + register float v_c7 asm("v71"); + register float v_c8 asm("v72"); + register float v_c9 asm("v73"); + register float v_c10 asm("v74"); + register float v_c11 asm("v75"); + register float v_c12 asm("v76"); + register float v_c13 asm("v77"); + register float v_c14 asm("v78"); + register float v_c15 asm("v79"); + register float v_c16 asm("v80"); + register float v_c17 asm("v81"); + register float v_c18 asm("v82"); + register float v_c19 asm("v83"); + register float v_c20 asm("v84"); + register float v_c21 asm("v85"); + register float v_c22 asm("v86"); + register float v_c23 asm("v87"); + register float v_c24 asm("v88"); + register float v_c25 asm("v89"); + register float v_c26 asm("v90"); + register float v_c27 asm("v91"); + register float v_c28 asm("v92"); + register float v_c29 asm("v93"); + register float v_c30 asm("v94"); + register float v_c31 asm("v95"); + int32_t nan_hi = 0x7fff0000; // %[v_nan_hi]: still passed as an operand; the f16 _UK_PK_CVT_ in the .inc does not reference it — TODO confirm no other use + int32_t nan_lo = 0x00007fff; // %[v_nan_lo]: same remark as nan_hi + + // in smem, 
the layout is M0(2)*K0(128)*M1(16)*K1(4) + // every thread needs 8xK in contiguous registers + // ... and every wave needs the same data + int lane_id = threadIdx.x % 64; + int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128; + sld_y_os *= 2; // presumably element offset -> byte offset for 2-byte data — TODO confirm + + // y y p p p y + // reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4) + // but order is N0*M0*Nv + // in LDS we need to store as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4) + // y y wave-id lid/16 lid%16 v + // sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4 + int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4); + sfl_sst *= 2; + + // from LDS we need to load as + // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4) + // ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1issue(pk2) + // sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4 + int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4; + sfl_sld *= 2; + + // B nr->kr + // clang-format off +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winline-asm" + asm volatile( // the .inc included below supplies the asm template; CK_TILE_FLATMM_UK_MFMA selects its f16 macros +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16 +#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc" +#undef CK_TILE_FLATMM_UK_MFMA + :[smem_]"+r"(smem), + [s_loop_cnt]"+s"(loop_cnt), + [c0]"+v" (v_c0), + [c1]"+v" (v_c1), + [c2]"+v" (v_c2), + [c3]"+v" (v_c3), + [c4]"+v" (v_c4), + [c5]"+v" (v_c5), + [c6]"+v" (v_c6), + [c7]"+v" (v_c7), + [c8]"+v" (v_c8), + [c9]"+v" (v_c9), + [c10]"+v"(v_c10), + [c11]"+v"(v_c11), + [c12]"+v"(v_c12), + [c13]"+v"(v_c13), + [c14]"+v"(v_c14), + [c15]"+v"(v_c15), + [c16]"+v"(v_c16), + [c17]"+v"(v_c17), + [c18]"+v"(v_c18), + [c19]"+v"(v_c19), + [c20]"+v"(v_c20), + [c21]"+v"(v_c21), + [c22]"+v"(v_c22), + [c23]"+v"(v_c23), + [c24]"+v"(v_c24), + [c25]"+v"(v_c25), + [c26]"+v"(v_c26), + [c27]"+v"(v_c27), + [c28]"+v"(v_c28), + [c29]"+v"(v_c29), + [c30]"+v"(v_c30), + [c31]"+v"(v_c31) + : + [sld_a_base]"n"(0), + [shfl_base]"n"(0), + [v_sld_y_os]"v"(sld_y_os), + [v_sfl_sld]"v"(sfl_sld), + 
[v_sfl_sst]"v"(sfl_sst), + [s_res_o0]"s"(res_o[0]), + [s_res_o1]"s"(res_o[1]), + //[s_res_o2]"s"(res_o[2]), + //[s_res_o3]"s"(res_o[3]), + [s_res_b0]"s"(res_b[0]), + [s_res_b1]"s"(res_b[1]), + [s_res_b2]"s"(res_b[2]), + [s_res_b3]"s"(res_b[3]), + [v_os_o0]"v"(static_cast(cached_coords_o[number<0>{}] * sizeof(ODataType))), + [v_os_o1]"v"(static_cast(cached_coords_o[number<1>{}] * sizeof(ODataType))), + [v_os_o2]"v"(static_cast(cached_coords_o[number<2>{}] * sizeof(ODataType))), + [v_os_o3]"v"(static_cast(cached_coords_o[number<3>{}] * sizeof(ODataType))), + [v_os_o4]"v"(static_cast(cached_coords_o[number<4>{}] * sizeof(ODataType))), + [v_os_o5]"v"(static_cast(cached_coords_o[number<5>{}] * sizeof(ODataType))), + [v_os_o6]"v"(static_cast(cached_coords_o[number<6>{}] * sizeof(ODataType))), + [v_os_o7]"v"(static_cast(cached_coords_o[number<7>{}] * sizeof(ODataType))), + [v_os_b0]"v"(static_cast(cached_coords_b[number<0>{}] * sizeof(BDataType))), + [v_os_b1]"v"(static_cast(cached_coords_b[number<1>{}] * sizeof(BDataType))), + [v_os_b2]"v"(static_cast(cached_coords_b[number<2>{}] * sizeof(BDataType))), + [v_os_b3]"v"(static_cast(cached_coords_b[number<3>{}] * sizeof(BDataType))), + [v_os_b4]"v"(static_cast(cached_coords_b[number<4>{}] * sizeof(BDataType))), + [v_os_b5]"v"(static_cast(cached_coords_b[number<5>{}] * sizeof(BDataType))), + [v_os_b6]"v"(static_cast(cached_coords_b[number<6>{}] * sizeof(BDataType))), + [v_os_b7]"v"(static_cast(cached_coords_b[number<7>{}] * sizeof(BDataType))), + + [s_tile_os_o]"s"(tile_stride_o_bytes), + [s_tile_os_b]"s"(tile_stride_b_bytes), + [scale_0]"v"(s0), + [scale_1]"v"(s1), + [v_nan_lo]"v"(nan_lo), + [v_nan_hi]"v"(nan_hi), + [s_execflag_0]"s"(o_flags[number<0>{}]), + [s_execflag_1]"s"(o_flags[number<1>{}]), + [s_execflag_2]"s"(o_flags[number<2>{}]), + [s_execflag_3]"s"(o_flags[number<3>{}]), + [s_execflag_4]"s"(o_flags[number<4>{}]), + [s_execflag_5]"s"(o_flags[number<5>{}]), + [s_execflag_6]"s"(o_flags[number<6>{}]), + 
[s_execflag_7]"s"(o_flags[number<7>{}]) + : + "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", + "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", + "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", + "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", + "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", + "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", + "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", + "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", + "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", + "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", + "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", + "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", + "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123", + "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131", + "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139", + "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147", + "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155", + "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", + "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171", + "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", + "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", + "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195", + "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203", + "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211", + "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219", + "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227", + "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", + "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", + "a244", "a245", "a246", "a247", "a248", 
"a249", "a250", "a251", + "a252", "a253", "a254", "a255", + "s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86", + "s36", "s37", + "v50", "v54", "v55", + "v64","v65","v66","v67","v68","v69","v70","v71", + "v72","v73","v74","v75","v76","v77","v78","v79", + "v80","v81","v82","v83","v84","v85","v86","v87", + "v88","v89","v90","v91","v92","v93","v94","v95", + "v128", "v129", "v130", "v131", + "v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139", + "v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147", + "v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155", + "v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163", + "v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171", + "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", + "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", + "v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195", + "v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203", + "v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211", + "v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219", + "v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227", + "v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235", + "v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243", + "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", + "v252", "v253", "v254", "v255" + ); +#pragma clang diagnostic pop + // clang-format on + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp b/include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp new file mode 100644 index 0000000000..003335c0e7 --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#define CK_TILE_FLATMM_UK_MFMA_FP16 0 +#define CK_TILE_FLATMM_UK_MFMA_BF16 1 +#define CK_TILE_FLATMM_UK_MFMA_INT8 2 +#define CK_TILE_FLATMM_UK_MFMA_FP8 3 +#define CK_TILE_FLATMM_UK_MFMA_BF8 4 diff --git a/include/ck_tile/ops/flatmm/block/uk/README.md b/include/ck_tile/ops/flatmm/block/uk/README.md new file mode 100644 index 0000000000..84fa132296 --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/uk/README.md @@ -0,0 +1 @@ +the files under this folder should not be included directly! \ No newline at end of file diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc new file mode 100644 index 0000000000..8b57611f06 --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc @@ -0,0 +1,613 @@ +#ifndef CK_TILE_FLATMM_UK_MFMA +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16 +#endif + +#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16 +# define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16" + +# define _UK_PK_CVT_(x0_, x1_, y_) \ + " v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \ + " v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \ + " v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \ + " v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \ + " v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \ + " v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \ + " v_perm_b32 " y_ ", v55, v54, s52 \n" + +# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16" + +#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16 +#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16" + +# define _UK_PK_CVT_(x0_, x1_, y_) \ + " v_cvt_f16_f32 v54, " x0_ " \n" \ + " v_cvt_f16_f32 v55, " x1_ " \n" \ + " v_pack_b32_f16 " y_ ", v54, v55 \n" + +# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16" + +#endif + + +";-------------------------------------------------------------\n" +" s_mov_b32 s52, 0x07060302 ; v_perm\n" +" 
s_mov_b64 s[38:39], exec ; save current exec\n" +" s_mov_b32 s8, %[s_res_o0] \n" +" s_mov_b32 s9, %[s_res_o1] \n" +" s_mov_b32 s12, %[s_res_b0] \n" +" s_mov_b32 s13, %[s_res_b1] \n" +" s_mov_b32 s14, %[s_res_b2] \n" +" s_mov_b32 s15, %[s_res_b3] \n" +" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base] \n" +" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base] \n" +" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base] \n" +" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base] \n" +" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base] \n" +" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base] \n" +" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base] \n" +" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base] \n" +" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base] \n" +" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base] \n" +" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base] \n" +" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base] \n" +" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base] \n" +" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base] \n" +" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base] \n" +" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base] \n" +" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n" +" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n" +" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n" +" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n" +" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n" +" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n" +" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n" +" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] 
\n" +" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n" +" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n" +" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n" +" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n" +" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n" +" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n" +" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n" +" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n" +" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n" +" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n" +" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n" +" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n" +" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n" +" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n" +" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n" +" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n" +" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n" +" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n" +" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n" +" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n" +" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n" +" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n" +" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n" +" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n" +" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n" +" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n" +" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] 
\n" +" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n" +" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n" +" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n" +" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n" +" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n" +" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n" +" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n" +" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n" +" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n" +" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n" +" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n" +" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n" +" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n" +" s_waitcnt 0 \n" +" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n" +" 
buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" +" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n" +" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" +" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" +" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" +" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +" s_cselect_b32 s86, %[s_tile_os_b], 0 \n" +" s_add_u32 s12, s86, s12 \n" +" s_addc_u32 s13, 0, s13 \n" +" s_waitcnt 0 \n" +"L_start%=: \n" +" s_waitcnt vmcnt(32) \n" +" s_barrier \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n" +" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " 
[%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n" +" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n" +" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], 
acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n" +" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], [%[c12], %[c13], %[c14], %[c15]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], 
acc[38:39], v[150:151], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]] \n" 
+_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], 
[%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], 
%[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], [%[c12], %[c13], %[c14], %[c15]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]] 
\n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]] \n" +" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n" +" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], 
%[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]] \n" +" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], %[c13], %[c14], %[c15]] \n" +" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], %[c13], %[c14], %[c15]] \n" +_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], %[c15]]\n" +" v_mul_f32 %[c0], %[scale_0], %[c0] \n" +" v_mul_f32 %[c1], %[scale_0], %[c1] \n" +" v_mul_f32 %[c2], %[scale_0], %[c2] \n" +" v_mul_f32 %[c3], %[scale_0], %[c3] \n" +" v_mul_f32 %[c4], %[scale_1], %[c4] \n" +" v_mul_f32 %[c5], %[scale_1], %[c5] \n" +" v_mul_f32 %[c6], %[scale_1], %[c6] \n" +" v_mul_f32 %[c7], %[scale_1], %[c7] \n" +" v_mul_f32 %[c8], %[scale_0], %[c8] \n" +" v_mul_f32 %[c9], %[scale_0], %[c9] \n" +" v_mul_f32 %[c10], %[scale_0], %[c10] \n" +" 
v_mul_f32 %[c11], %[scale_0], %[c11] \n" +" v_mul_f32 %[c12], %[scale_1], %[c12] \n" +" v_mul_f32 %[c13], %[scale_1], %[c13] \n" +" v_mul_f32 %[c14], %[scale_1], %[c14] \n" +" v_mul_f32 %[c15], %[scale_1], %[c15] \n" +_UK_PK_CVT_("%[c0]", "%[c1]", "%[c0]") +_UK_PK_CVT_("%[c2]", "%[c3]", "%[c1]") +_UK_PK_CVT_("%[c4]", "%[c5]", "%[c2]") +_UK_PK_CVT_("%[c6]", "%[c7]", "%[c3]") +_UK_PK_CVT_("%[c8]", "%[c9]", "%[c4]") +_UK_PK_CVT_("%[c10]", "%[c11]", "%[c5]") +_UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]") +_UK_PK_CVT_("%[c14]", "%[c15]", "%[c7]") +" ;------------------------------ \n" +" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:0 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base] \n" +" s_waitcnt lgkmcnt(0) \n" +" s_barrier \n" +" ds_read_b32 %[c0], %[v_sfl_sld] offset:0 + %[shfl_base] \n" +" ds_read_b32 %[c1], %[v_sfl_sld] offset:32 + %[shfl_base] \n" +" ds_read_b32 %[c2], %[v_sfl_sld] offset:64 + %[shfl_base] \n" +" ds_read_b32 %[c3], %[v_sfl_sld] offset:96 + %[shfl_base] \n" +" ds_read_b32 %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base] \n" +" ds_read_b32 %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base] \n" +" ds_read_b32 %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base] \n" +" ds_read_b32 %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base] \n" +" s_waitcnt lgkmcnt(0) \n" +" s_mov_b64 exec, %[s_execflag_0] \n" +_UK_ATOMIC_ADD_ " %[v_os_o0], %[c0], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_1] \n" +_UK_ATOMIC_ADD_ " %[v_os_o1], %[c1], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_2] \n" +_UK_ATOMIC_ADD_ " %[v_os_o2], %[c2], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_3] \n" +_UK_ATOMIC_ADD_ " %[v_os_o3], %[c3], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_4] \n" +_UK_ATOMIC_ADD_ " %[v_os_o4], %[c4], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_5] \n" +_UK_ATOMIC_ADD_ " %[v_os_o5], %[c5], s[8:9] \n" 
+" s_mov_b64 exec, %[s_execflag_6] \n" +_UK_ATOMIC_ADD_ " %[v_os_o6], %[c6], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_7] \n" +_UK_ATOMIC_ADD_ " %[v_os_o7], %[c7], s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k-- \n" +" s_cmp_gt_i32 %[s_loop_cnt] 0 \n" +" s_cbranch_scc0 L_end%= \n" +" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +" s_cselect_b32 s86, %[s_tile_os_b], 0 \n" +" s_add_u32 s12, s86, s12 \n" +" s_addc_u32 s13, 0, s13 \n" +" s_add_u32 s8, %[s_tile_os_o], s8 \n" +" s_addc_u32 s9, 0, s9 \n" +" s_waitcnt vmcnt(32) \n" +" s_barrier \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0 \n" +" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0 \n" +" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], 
[%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0 \n" +" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0 \n" +" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], 
acc[152:153], v[200:201], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], [%[c28],%[c29],%[c30],%[c31]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " 
[%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " 
[%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], [%[c28],%[c29],%[c30],%[c31]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], [%[c20],%[c21],%[c22],%[c23]] \n" 
+_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " 
[%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], [%[c28],%[c29],%[c30],%[c31]] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], [%[c16],%[c17],%[c18],%[c19]] \n" +" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], [%[c16],%[c17],%[c18],%[c19]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], [%[c20],%[c21],%[c22],%[c23]] \n" 
+_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], [%[c20],%[c21],%[c22],%[c23]] \n" +" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], [%[c20],%[c21],%[c22],%[c23]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], [%[c24],%[c25],%[c26],%[c27]] \n" +" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], [%[c24],%[c25],%[c26],%[c27]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " 
[%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], [%[c28],%[c29],%[c30],%[c31]] \n" +" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], [%[c28],%[c29],%[c30],%[c31]] \n" +_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]\n" +" v_mul_f32 %[c16], %[scale_0], %[c16] \n" +" v_mul_f32 %[c17], %[scale_0], %[c17] \n" +" v_mul_f32 %[c18], %[scale_0], %[c18] \n" +" v_mul_f32 %[c19], %[scale_0], %[c19] \n" +" v_mul_f32 %[c20], %[scale_1], %[c20] \n" +" v_mul_f32 %[c21], %[scale_1], %[c21] \n" +" v_mul_f32 %[c22], %[scale_1], %[c22] \n" +" v_mul_f32 %[c23], %[scale_1], %[c23] \n" +" v_mul_f32 %[c24], %[scale_0], %[c24] \n" +" v_mul_f32 %[c25], %[scale_0], %[c25] \n" +" v_mul_f32 %[c26], %[scale_0], %[c26] \n" +" v_mul_f32 %[c27], %[scale_0], %[c27] \n" +" v_mul_f32 %[c28], %[scale_1], %[c28] \n" +" v_mul_f32 %[c29], %[scale_1], %[c29] \n" +" v_mul_f32 %[c30], %[scale_1], %[c30] \n" +" v_mul_f32 %[c31], %[scale_1], %[c31] \n" + +_UK_PK_CVT_("%[c16]", "%[c17]", "%[c16]") +_UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]") +_UK_PK_CVT_("%[c20]", "%[c21]", "%[c18]") +_UK_PK_CVT_("%[c22]", "%[c23]", "%[c19]") +_UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]") +_UK_PK_CVT_("%[c26]", "%[c27]", "%[c21]") +_UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]") +_UK_PK_CVT_("%[c30]", "%[c31]", "%[c23]") + +" ;------------------------------ \n" +" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:0 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base] \n" +" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base] \n" +" s_waitcnt lgkmcnt(0) \n" +" s_barrier \n" +" ds_read_b32 %[c16], %[v_sfl_sld] offset:0 + %[shfl_base] \n" +" 
ds_read_b32 %[c17], %[v_sfl_sld] offset:32 + %[shfl_base] \n" +" ds_read_b32 %[c18], %[v_sfl_sld] offset:64 + %[shfl_base] \n" +" ds_read_b32 %[c19], %[v_sfl_sld] offset:96 + %[shfl_base] \n" +" ds_read_b32 %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base] \n" +" ds_read_b32 %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base] \n" +" ds_read_b32 %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base] \n" +" ds_read_b32 %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base] \n" +" s_waitcnt lgkmcnt(0) \n" +" s_mov_b64 exec, %[s_execflag_0] \n" +_UK_ATOMIC_ADD_ " %[v_os_o0], %[c16], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_1] \n" +_UK_ATOMIC_ADD_ " %[v_os_o1], %[c17], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_2] \n" +_UK_ATOMIC_ADD_ " %[v_os_o2], %[c18], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_3] \n" +_UK_ATOMIC_ADD_ " %[v_os_o3], %[c19], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_4] \n" +_UK_ATOMIC_ADD_ " %[v_os_o4], %[c20], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_5] \n" +_UK_ATOMIC_ADD_ " %[v_os_o5], %[c21], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_6] \n" +_UK_ATOMIC_ADD_ " %[v_os_o6], %[c22], s[8:9] \n" +" s_mov_b64 exec, %[s_execflag_7] \n" +_UK_ATOMIC_ADD_ " %[v_os_o7], %[c23], s[8:9] \n" +" s_mov_b64 exec, s[38:39] \n" +" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k-- \n" +" s_cmp_gt_i32 %[s_loop_cnt] 0 \n" +" s_cbranch_scc0 L_end%= \n" +" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +" s_cselect_b32 s86, %[s_tile_os_b], 0 \n" +" s_add_u32 s12, s86, s12 \n" +" s_addc_u32 s13, 0, s13 \n" +" s_add_u32 s8, %[s_tile_os_o], s8 \n" +" s_addc_u32 s9, 0, s9 \n" +" s_branch L_start%= \n" +"L_end%=: \n" + +#undef _UK_MFMA_ +#undef _UK_PK_CVT_ +#undef _UK_ATOMIC_ADD_ diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc new file mode 100644 index 0000000000..a34a21d39f --- /dev/null +++ 
b/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc @@ -0,0 +1,516 @@ +#ifndef CK_TILE_FLATMM_UK_MFMA +#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16 +#endif + +#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16 +#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16" +#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16 +#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16" +#endif + +"s_mov_b32 s16, %[s_res_a0] \n" +"s_mov_b32 s17, %[s_res_a1] \n" +"s_mov_b32 s18, %[s_res_a2] \n" +"s_mov_b32 s19, %[s_res_a3] \n" +"s_mov_b32 s20, %[s_res_b0] \n" +"s_mov_b32 s21, %[s_res_b1] \n" +"s_mov_b32 s22, %[s_res_b2] \n" +"s_mov_b32 s23, %[s_res_b3] \n" +// "s_nop 4\n" +"; -- prefetch A0\n" +"s_add_u32 m0, 0, %[s_m0_init] \n" +"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[smem_sz], %[s_m0_init] \n" +"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move a with cond \n" +"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond \n" +"s_add_u32 s16, s86, s16 ; move a with cond \n" +"s_addc_u32 s17, 0, s17 ; move a with cond \n" +"; -- prefetch A1\n" +"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds \n" +"s_add_u32 m0, 
%[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds \n" +"s_add_u32 m0, %[s_size_per_issue], m0 \n" +"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds \n" +"s_add_u32 m0, 0, %[s_m0_init] \n" +"s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond \n" +"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond \n" +"s_add_u32 s16, s86, s16 ; move a with cond \n" +"s_addc_u32 s17, 0, s17 ; move a with cond \n" +"; -- prefetch B0\n" +"buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen 
offset:2048 \n" +"buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072 \n" +"buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen \n" +"buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024 \n" +"buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048 \n" +"buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072 \n" +"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +"s_cselect_b32 s86, %[s_tile_os_b], 0 ; move b with cond \n" +"s_add_u32 s20, s86, s20 ; move b with cond \n" +"s_addc_u32 s21, 0, s21 ; move b with cond \n" +"s_waitcnt vmcnt(40) \n" +"s_barrier \n" +"ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]\n" // 1024: N stride, 64 K stride +"ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]\n" +"ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]\n" +"ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]\n" +"ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + 
%[sld_os_4]\n" +"ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]\n" +"ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]\n" +"ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]\n" +"L_start%=: \n" +" s_waitcnt vmcnt(24) & lgkmcnt(0) \n" +" s_barrier \n" +_UK_MFMA_ " %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0] \n" +" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0] \n" +" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0] \n" +" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0] \n" +" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1] \n" +" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1] \n" +" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1] \n" +" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1] \n" +" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds \n" +" s_add_u32 m0, 
%[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2] \n" +" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2] \n" +" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2] \n" +" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2] \n" +" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3] \n" +" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3] \n" +" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3] \n" +" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3] \n" +" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[smem_sz], %[s_m0_init] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4] \n" +" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[20:23], 0 offen \n" +_UK_MFMA_ " 
%[v_acc_4], acc[36:37], v[68:69], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4] \n" +" ds_read_b128 v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0] \n" +_UK_MFMA_ " %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4] \n" +" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4] \n" +" ds_read_b128 v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1] \n" +_UK_MFMA_ " %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5] \n" +" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5] \n" +" ds_read_b128 v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2] \n" +_UK_MFMA_ " %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5] \n" +" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5] \n" +" ds_read_b128 v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3] \n" +_UK_MFMA_ " %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6] \n" +" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6] \n" +" ds_read_b128 v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4] \n" +_UK_MFMA_ " %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6] \n" +" buffer_load_dwordx4 acc[180:183], %[v_os_b3], 
s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6] \n" +" ds_read_b128 v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5] \n" +_UK_MFMA_ " %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7] \n" +" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7] \n" +" ds_read_b128 v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6] \n" +_UK_MFMA_ " %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7] \n" +" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7] \n" +" ds_read_b128 v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8] \n" +" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8] \n" +" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9] \n" +" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_9], acc[68:69], v[84:85], 
%[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9] \n" +" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10] \n" +" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10] \n" +" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11] \n" +" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11] \n" +" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12] \n" +" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[20:23], 0 offen \n" +_UK_MFMA_ " 
%[v_acc_12], acc[100:101], v[68:69], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12] \n" +" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13] \n" +" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13] \n" +" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14] \n" +" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14] \n" +" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15] \n" +" buffer_load_dwordx4 
acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15] \n" +" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072\n" +_UK_MFMA_ " %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15] \n" +" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 \n" +" s_cmp_gt_i32 %[s_loop_cnt] 0 \n" +" s_cbranch_scc0 L_end%= \n" +" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond \n" +" s_cselect_b32 s86, %[s_tile_os_a], 0 \n" +" s_add_u32 s16, s86, s16 \n" +" s_addc_u32 s17, 0, s17 \n" +" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +" s_cselect_b32 s86, %[s_tile_os_b], 0 \n" +" s_add_u32 s20, s86, s20 \n" +" s_addc_u32 s21, 0, s21 \n" +" ;------------------------------------------ \n" +" s_waitcnt vmcnt(24) & lgkmcnt(0) \n" +" s_barrier \n" +_UK_MFMA_ " %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0] \n" +" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0] \n" +" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0] \n" +" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0] \n" +_UK_MFMA_ " %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0] \n" +" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_1], acc[128:129], 
v[112:113], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1] \n" +" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1] \n" +" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1] \n" +" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1] \n" +_UK_MFMA_ " %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1] \n" +" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2] \n" +" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2] \n" +" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2] \n" +" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2] \n" +_UK_MFMA_ " %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2] \n" +" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3] \n" +" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_3], acc[148:149], 
v[116:117], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3] \n" +" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds \n" +" s_add_u32 m0, %[s_size_per_issue], m0 \n" +_UK_MFMA_ " %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3] \n" +" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3] \n" +_UK_MFMA_ " %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3] \n" +" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds \n" +" s_add_u32 m0, 0, %[s_m0_init] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4] \n" +" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4] \n" +" ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0] \n" +_UK_MFMA_ " %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4] \n" +" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4] \n" +_UK_MFMA_ " %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4] \n" +" ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1] \n" +_UK_MFMA_ " %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5] \n" +" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5] \n" +" ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2] \n" +_UK_MFMA_ " %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5] \n" +_UK_MFMA_ " 
%[v_acc_5], acc[170:171], v[122:123], %[v_acc_5] \n" +" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5] \n" +_UK_MFMA_ " %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5] \n" +" ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3] \n" +_UK_MFMA_ " %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6] \n" +" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6] \n" +" ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4] \n" +_UK_MFMA_ " %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6] \n" +" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6] \n" +_UK_MFMA_ " %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6] \n" +" ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5] \n" +_UK_MFMA_ " %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7] \n" +" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7] \n" +" ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6] \n" +_UK_MFMA_ " %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7] \n" +" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7] \n" +_UK_MFMA_ " %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7] \n" +" ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + 
%[sld_os_7] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8] \n" +" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8] \n" +" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8] \n" +_UK_MFMA_ " %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9] \n" +" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9] \n" +" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9] \n" +_UK_MFMA_ " %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10] \n" +" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10] \n" +" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_10], 
acc[220:221], v[108:109], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10] \n" +_UK_MFMA_ " %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11] \n" +" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11] \n" +" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11] \n" +_UK_MFMA_ " %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11] \n" +" s_waitcnt vmcnt(32) \n" +_UK_MFMA_ " %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12] \n" +" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12] \n" +" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12] \n" +_UK_MFMA_ " %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13] \n" +" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[234:235], 
v[122:123], %[v_acc_13] \n" +" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13] \n" +_UK_MFMA_ " %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14] \n" +" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen \n" +_UK_MFMA_ " %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14] \n" +" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024 \n" +_UK_MFMA_ " %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14] \n" +_UK_MFMA_ " %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15] \n" +" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048 \n" +_UK_MFMA_ " %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15] \n" +" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072 \n" +_UK_MFMA_ " %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15] \n" +_UK_MFMA_ " %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15] \n" +" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 \n" +" s_cmp_gt_i32 %[s_loop_cnt] 0 \n" +" s_cbranch_scc0 L_end%= \n" +" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond \n" +" s_cselect_b32 s86, %[s_tile_os_a], 0 \n" +" s_add_u32 s16, s86, s16 \n" +" s_addc_u32 s17, 0, s17 \n" +" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n" +" s_cselect_b32 
s86, %[s_tile_os_b], 0 \n" +" s_add_u32 s20, s86, s20 \n" +" s_addc_u32 s21, 0, s21 \n" +" s_branch L_start%= \n" +"L_end%=: \n" +" s_nop 2 \n" + +#undef _UK_MFMA_ diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp index 10bb01168f..173887513e 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp @@ -331,7 +331,8 @@ struct BlockFmhaPipelineQRKSVSAsync Policy::template MakeVDramTileDistribution()); // prefetch K tile - async_load_tile_raw(k_lds_store(LdsSeq.at(number<0>{})), k_dram_window, k_oob_ck, k_pre_np); + async_load_tile_raw( + k_lds_store(LdsSeq.at(number<0>{})), k_dram_window, number<-1>{}, k_oob_ck, k_pre_np); move_tile_window(k_dram_window, {0, kK0}); __builtin_amdgcn_sched_barrier(0); @@ -355,6 +356,7 @@ struct BlockFmhaPipelineQRKSVSAsync static_for<0, k0_loops - 1, 1>{}([&](auto i_k0) { async_load_tile_raw(k_lds_store(number{})>{}), k_dram_window, + number<-1>{}, k_oob_ck, k_pre_np); if constexpr(i_k0 < k0_loops - 1) @@ -386,7 +388,7 @@ struct BlockFmhaPipelineQRKSVSAsync __builtin_amdgcn_s_barrier(); const auto bias_tile = load_tile(bias_dram_window); // load bias tile - auto v_buf = load_tile(v_dram_window, bool_constant{}); + auto v_buf = load_tile(v_dram_window, number<-1>{}, bool_constant{}); __builtin_amdgcn_sched_barrier(0); { // tail gemm_0(s_acc, @@ -514,7 +516,8 @@ struct BlockFmhaPipelineQRKSVSAsync move_tile_window( v_dram_window, {0, kK1}); // will have scratch if move this right after load_tile(v_dram)... 
- v_buf = load_tile(v_dram_window, bool_constant{}); // load next v_buf + v_buf = load_tile( + v_dram_window, number<-1>{}, bool_constant{}); // load next v_buf } __builtin_amdgcn_sched_barrier(0); @@ -618,7 +621,8 @@ struct BlockFmhaPipelineQRKSVSAsync static_for<0, k1_loops - 1, 1>{}([&](auto i_k1) { if constexpr(i_k1 != 0 && i_k1 < k1_loops - 1) { - v_buf = load_tile(v_dram_window, bool_constant{}); // load next v_buf + v_buf = load_tile( + v_dram_window, number<-1>{}, bool_constant{}); // load next v_buf } block_sync_lds(); gemm_1(o_acc, @@ -665,8 +669,11 @@ struct BlockFmhaPipelineQRKSVSAsync if constexpr(k1_loops >= 2 && LdsSeq.at(number<0>{}) == LdsSeq.at(number{})) __builtin_amdgcn_s_barrier(); - async_load_tile_raw( - k_lds_store(LdsSeq.at(number<0>{})), k_dram_window, k_oob_ck, k_pre_np); + async_load_tile_raw(k_lds_store(LdsSeq.at(number<0>{})), + k_dram_window, + number<-1>{}, + k_oob_ck, + k_pre_np); move_tile_window(k_dram_window, {0, kK0}); } // tail diff --git a/include/ck_tile/ops/fused_moe.hpp b/include/ck_tile/ops/fused_moe.hpp index b74607f061..d23af0af8d 100644 --- a/include/ck_tile/ops/fused_moe.hpp +++ b/include/ck_tile/ops/fused_moe.hpp @@ -3,7 +3,15 @@ #pragma once +#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp" +#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp" +#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp" #include "ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp" #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp" #include 
"ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp" #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp" diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp new file mode 100644 index 0000000000..2d25d44f3c --- /dev/null +++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp @@ -0,0 +1,421 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/elementwise.hpp" +#include +#include + +// clang-format off +// [indexing implementation-1] +// using M_a as constexpr block_size to partition all tokens into different slices +// each slice map to one expert, and one expert can have multiple slices +// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5 +// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]] +// tok-0 tok-1 tok-2 tok-3 tok-4 +// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number) +// +// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]] +// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5 +// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]] +// +// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1) +// * this could be larger than actual, since actual tokens are on GPU +// +// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 0, 1, 2, 5] +// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4 -|- exp-5 -| +// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, c, f, i, o] +// +// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr +// +// * Note on token_id_per_expert/sorted_token_ids_ptr data: +// 
currently we do not have topk information from the data of token_id_per_expert/sorted_token_ids_ptr. +// In some cases (like smooth-quant), we need topk information to index into the token data quantized by +// different experts' smooth-quant. So we modify the number stored inside token_id_per_expert/sorted_token_ids_ptr +// +// 32bit 0........23 24.....31 bit +// (data) -> (token_id | topk_id) +// low 24 bits are for token id, top 8 bits are for topk id +// +// the input after smooth-quant is [token, topk, hidden_dim], originally it is [token, hidden_dim] +// the input scale for token is [topk, token, 1], the smooth-quant scale for first gemm is [expert, interm_dim] +// +// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5] +// * length is (max_num_tokens_padded + block_size - 1) / block_size +// +// num_tokens_post_padded_ptr : [28] +// num_sorted_tiles_ptr : [7] +// +// * different from vLLM +// 1) token_id stored in sorted_token_ids_ptr is actual token_id, not token_id*top_K expanded id +// 2) need sorted_weight_ptr +// 3) use num_sorted_tiles_ptr, already divided by M_a +// +// * below used for indexing +// 1) sorted_token_ids_ptr [max_num_tokens_padded] +// 2) sorted_weight_ptr +// 3) sorted_expert_ids_ptr +// 4) num_tokens_post_padded_ptr/num_sorted_tiles_ptr (select one) +// +// max_num_tokens_padded: topk_ids.numel() + num_experts * (block_size - 1) +// +// [indexing implementation-2] +// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]] +// tok-0 tok-1 tok-2 tok-3 tok-4 +// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number) +// +// we generate original row/col id as +// topk_rc_ids : [[0, 5, A], [1, 6, B], [2, 7, C], [3, 8, D], [4, 9, E]] +// let x be one element of above, we can get: +// topk_row_id(token_id) = x % num_tokens(5) +// topk_col_id(expert_Id) = x / num_tokens +// topk_row_id/col_id can be used to access original topk_ids/topk_weight +// +// token_id_per_expert is : [[0], [2, 3, 4], [1, 3],
[0, 1, 2, 3, 4], [], [0, 1, 5, 5]] +// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5 +// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]] +// +// we can get permuted_rc_ids: +// [[0], [2, 3, 4], [1, 8], [5, 6, 7, D, 9], [], [A, B, C, E]] +// +// +// clang-format on +// +namespace ck_tile { + +// m: num_tokens (or token*input-batch) +// k: hidden_size +// n: intermediate_size used between 2 FC (TP slice this) +// e: num expert +// if doing pre-shuffle +// nr : n / Block_Nr +// kr : k / Block_Kr +// w : flattened 1d wave buffer +struct FusedMoeGemmHostArgs +{ + const void* a_ptr; // [m, k], input token + const void* a_scale_ptr; // [m, 1], token scale + const void* g_ptr; // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w]) + const void* d_ptr; // [e, n, k], pre-shuffle([e, nr, kr, w]) + const void* g_scale_ptr; // [e, 1, n], gate(up) scale + const void* d_scale_ptr; // [e, 1, k], down scale + const void* y_smooth_scale_ptr; // [e, 1, n], smooth-quant-scale for 2nd gemm input + void* o_ptr; // [m, k], output token + + const void* sorted_token_ids_ptr; // [max_num_tokens_padded] + const void* sorted_weight_ptr; // [max_num_tokens_padded] + const void* sorted_expert_ids_ptr; // [(max_num_tokens_padded + block_size - 1) / block_size] + const void* num_sorted_tiles_ptr; // [1] + + index_t hidden_size; // k + index_t intermediate_size; // n / TP, for Gate. if Gate+Up, Down need divide by 2 + index_t num_tokens; // input number of tokens for current iteration + index_t num_experts; // number of groups + index_t topk; // need this?
+ + index_t stride_token; // for input/output, stride for each row, should be >= hidden_size +}; + +// This is scatter/gather b2b group-gemm +template +struct FusedMoeGemmKernel +{ + using Partitioner = remove_cvref_t; + using Pipeline = remove_cvref_t; + using Epilogue = remove_cvref_t; // TODO: not used + // static constexpr index_t kBlockPerCu = Pipeline::kBlockPerCu; + // static_assert(kBlockPerCu > 0); + + using BlockShape = typename Pipeline::BlockShape; // this is FusedMoeGemmShape + static constexpr index_t BlockSize_ = BlockShape::BlockSize; + + using ADataType = typename Pipeline::Problem::ADataType; + using GDataType = typename Pipeline::Problem::GDataType; + using DDataType = typename Pipeline::Problem::DDataType; + using AccDataType = typename Pipeline::Problem::AccDataType; + using ODataType = typename Pipeline::Problem::ODataType; + using AScaleDataType = typename Pipeline::Problem::AScaleDataType; + using GScaleDataType = typename Pipeline::Problem::GScaleDataType; + using DScaleDataType = typename Pipeline::Problem::DScaleDataType; + using YSmoothScaleDataType = typename Pipeline::Problem::YSmoothScaleDataType; + using TopkWeightDataType = typename Pipeline::Problem::TopkWeightDataType; + using IndexDataType = typename Pipeline::Problem::IndexDataType; + using YDataType = typename Pipeline::Problem::YDataType; + + using Traits = typename Pipeline::Problem::Traits; + static constexpr bool UseUK = true; + + static constexpr bool IsGateOnly = Traits::IsGateOnly; + static constexpr bool UseSmoothQuant = Traits::UseSmoothQuant; + static constexpr bool PadHiddenSize = Traits::PadHiddenSize; + static constexpr bool PadIntermediateSize = Traits::PadIntermediateSize; + + // clang-format off + template struct t2s; + template <> struct t2s { static constexpr const char * name = "fp32"; }; + template <> struct t2s { static constexpr const char * name = "fp16"; }; + template <> struct t2s { static constexpr const char * name = "bf16"; }; + template <> struct t2s {
static constexpr const char * name = "fp8"; }; + template <> struct t2s { static constexpr const char * name = "bf8"; }; + template <> struct t2s { static constexpr const char * name = "int8"; }; + // clang-format on + + CK_TILE_HOST static std::string GetName() + { +#define _SS_ std::string +#define _TS_ std::to_string + // clang-format off + using S_ = BlockShape; + + auto prec_str = [&] () { + std::string base_str = _SS_(t2s::name); + if (!std::is_same_v) { + base_str += _SS_("_") + _SS_(t2s::name); + } + return base_str; + }(); + + return _SS_("fused_moe_") + _SS_(prec_str) + "_" + + _TS_(S_::Block_M0) + "x" + _TS_(S_::Block_N0) + "x" + _TS_(S_::Block_K0) + "x" + _TS_(S_::Block_N1) + "_" + + _TS_(S_::WarpPerBlock_M0) + "x" + _TS_(S_::WarpPerBlock_N0) + "x" + _TS_(S_::WarpPerBlock_K0) + "_" + + _TS_(S_::Warp_M0) + "x" + _TS_(S_::Warp_N0) + "x" + _TS_(S_::Warp_K0) + "_" + _SS_(Pipeline::name); +#undef _SS_ +#undef _TS_ + // clang-format on + } + + struct FusedMoeGemmKargs + { + const void* a_ptr; // [m, k], input token + const void* a_scale_ptr; // [m, 1], token scale + const void* g_ptr; // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w]) + const void* d_ptr; // [e, n, k], pre-shuffle([e, nr, kr, w]) + const void* g_scale_ptr; // [e, 1, n], gate(up) scale + const void* d_scale_ptr; // [e, 1, k], down scale + const void* y_smooth_scale_ptr; // [e, 1, n], smooth-quant-scale for 2nd gemm input + void* o_ptr; // [m, k], output token + + const void* sorted_token_ids_ptr; + const void* sorted_weight_ptr; + const void* sorted_expert_ids_ptr; + const void* num_sorted_tiles_ptr; + + index_t hidden_size; // k + index_t intermediate_size; // n / TP, for Gate. if Gate+Up, Down need divide by 2 + index_t num_tokens; // input number of tokens for current iteration + index_t num_experts; // number of groups + index_t topk; // need this?
+ + index_t stride_token; // for input/output, stride for each row, should be >= hidden_size + }; + + // TODO: switch karg based on + using Kargs = FusedMoeGemmKargs; + using Hargs = FusedMoeGemmHostArgs; + + CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs) + { + // TODO: hargs/kargs not guaranteed to be the same (the bit_cast below relies on identical layout) + return bit_cast(hargs); + } + + CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) + { + constexpr index_t block_m = BlockShape::Block_M0; + int max_num_tokens_padded = + hargs.topk * hargs.num_tokens + hargs.num_experts * block_m - hargs.topk; + // printf("xxx max_num_tokens_padded:%d\n", max_num_tokens_padded); + return Partitioner::GridSize(max_num_tokens_padded, hargs.intermediate_size); + } + + CK_TILE_HOST static constexpr auto BlockSize() { return dim3(BlockSize_); } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + if constexpr(UseUK) + { + __shared__ CK_TILE_LDS_ADDR ADataType smem[GetSmemSize()]; + IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane( + *reinterpret_cast(kargs.num_sorted_tiles_ptr)); + + num_sorted_tiles = num_sorted_tiles / BlockShape::Block_M0; + + const auto [sorted_tile_id, intermediate_tile_id] = + Partitioner{}(num_sorted_tiles, kargs.intermediate_size); + // if(threadIdx.x == 0) + // printf("bid:%d,%d, num_sorted_tiles:%d, sorted_tile_id:%d(%d), + // intermediate_tile_id:%d\n", static_cast(blockIdx.x), + // static_cast(blockIdx.y), num_sorted_tiles, sorted_tile_id, sorted_tile_id >= + // num_sorted_tiles?
1 : 0, intermediate_tile_id); + if(sorted_tile_id >= num_sorted_tiles) + return; + + Pipeline{}(kargs, smem, sorted_tile_id, intermediate_tile_id); + } + else + { + // allocate LDS + // __shared__ char smem_ptr[GetSmemSize()]; + IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane( + *reinterpret_cast(kargs.num_sorted_tiles_ptr)); + constexpr index_t hidden_radio_0 = IsGateOnly ? 1 : 2; + + index_t nr_0 = kargs.intermediate_size / BlockShape::Block_Nr0; + index_t kr_0 = kargs.hidden_size / BlockShape::Block_Kr0; + index_t nr_1 = kargs.hidden_size / BlockShape::Block_Nr1; // should be same as kr_0 + index_t kr_1 = + kargs.intermediate_size / BlockShape::Block_Kr1; // should be same as nr_0 + + index_t expert_stride_0 = kargs.intermediate_size * hidden_radio_0 * kargs.hidden_size; + index_t expert_stride_1 = kargs.intermediate_size * kargs.hidden_size; + + __shared__ CK_TILE_LDS_ADDR ADataType smem[GetSmemSize()]; + + // note this is in unit of tile, need to multiply by tile size to get the index + const auto [sorted_tile_id, intermediate_tile_id] = + Partitioner{}(num_sorted_tiles, kargs.intermediate_size); + if(sorted_tile_id >= num_sorted_tiles) + return; + + const IndexDataType expert_id = + __builtin_amdgcn_readfirstlane(reinterpret_cast( + kargs.sorted_expert_ids_ptr)[sorted_tile_id]); + + // index along intermediate_size + // index_t hidden_idx = __builtin_amdgcn_readfirstlane(intermediate_tile_id * + // BlockShape::Block_N0); + index_t interm_idx_nr = + __builtin_amdgcn_readfirstlane(intermediate_tile_id * BlockShape::Block_Nr0); + + const auto a_coord = Pipeline::GetACoord(); // 2d thread offset, [i_row, i_col] + const auto sorted_token_id = + a_coord[number<0>{}] + sorted_tile_id * BlockShape::Block_M0; + + index_t token_id = + reinterpret_cast(kargs.sorted_token_ids_ptr)[sorted_token_id]; + auto topk_weight = reinterpret_cast( + kargs.sorted_weight_ptr)[sorted_token_id]; + + const auto a_window = [&]() { + // A is already pre-padded in previous kernel
+ const ADataType* a_ptr = reinterpret_cast(kargs.a_ptr); + const auto a_view_ = make_naive_tensor_view( + a_ptr, + make_tuple(kargs.num_tokens, kargs.hidden_size), + make_tuple(kargs.stride_token, 1), + number{}, + number<1>{}); + + // the gather happens here, using an indexing transform + const auto a_gather_view_ = transform_tensor_view( + a_view_, + make_tuple(make_indexing_transform(kargs.num_tokens, token_id), + make_pass_through_transform(kargs.hidden_size)), + make_tuple(sequence<0>{}, sequence<1>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + const auto a_window_ = make_tile_window( + a_gather_view_, + make_tuple(number{}, number{}), + {0, 0}); + return a_window_; + }(); + + // TODO: gtile using NSub to have less register pressure + const auto g_window = [&]() { + const GDataType* g_ptr = reinterpret_cast(kargs.g_ptr) + + static_cast(expert_id) * expert_stride_0 + + interm_idx_nr * kr_0 * BlockShape::Block_W0; + const auto g_view_ = make_naive_tensor_view( + g_ptr, + make_tuple(nr_0, kr_0, number{}), + make_tuple(kr_0 * BlockShape::Block_W0, number{}, 1), + number{}, + number<1>{}); + const auto g_view_1_ = + pad_tensor_view(g_view_, + make_tuple(number{}, + number{}, + number{}), + sequence{}); + + const auto g_window_ = make_tile_window(g_view_1_, + make_tuple(number{}, + number{}, + number{}), + {0, 0, 0}); + return g_window_; + }(); + + const auto d_window = [&]() { + const DDataType* d_ptr = reinterpret_cast(kargs.d_ptr) + + static_cast(expert_id) * expert_stride_1 + + interm_idx_nr * BlockShape::Block_W1; + // note interm_idx_nr is along the gemm-k dim of 2nd gemm + + const auto d_view_ = make_naive_tensor_view( + d_ptr, + make_tuple(nr_1, kr_1, BlockShape::Block_W1), + make_tuple(kr_1 * BlockShape::Block_W1, BlockShape::Block_W1, 1), + number{}, + number<1>{}); + const auto d_view_1_ = + pad_tensor_view(d_view_, + make_tuple(number{}, + number{}, + number{}), + sequence{}); + + const auto d_window_ = make_tile_window(d_view_1_, + make_tuple(number{}, +
number{}, + number{}), + {0, 0, 0}); + return d_window_; + }(); + + auto o_window = [&]() { + ODataType* o_ptr = reinterpret_cast(kargs.o_ptr); + auto o_view_ = make_naive_tensor_view( + o_ptr, + make_tuple(kargs.num_tokens, kargs.hidden_size), + make_tuple(kargs.stride_token, 1), + number{}, + number<1>{}); + + // scatter is here (same indexing transform on token_id as the gather of A) + auto o_scatter_view_ = transform_tensor_view( + o_view_, + make_tuple(make_indexing_transform(kargs.num_tokens, token_id), + make_pass_through_transform(kargs.hidden_size)), + make_tuple(sequence<0>{}, sequence<1>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + auto o_window_ = make_tile_window( + o_scatter_view_, + make_tuple(number{}, number{}), + {0, 0}); + return o_window_; + }(); + + // do compute (NOTE(review): this call passes kargs.stride_token as a 9th argument; confirm the pipeline used on this non-UK path accepts it) + Pipeline{}(a_window, + g_window, + d_window, + o_window, + topk_weight, + smem, + kargs.hidden_size, + kargs.intermediate_size, + kargs.stride_token); + } + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp new file mode 100644 index 0000000000..4f3f8bb7d3 --- /dev/null +++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +/* +tensors: +1. act (A): input feature map +2. gate (G): B matrix for first gemm, output will do activation(Silu) +3. up (U): B matrix for first gemm +4.
down (D): B matrix for second gemm + N1 + / \ + +----------+ | + | Down | | + x----------x | + hidden hidden K1 | | | + N0 N0 x----------x | + | +------x-----x------+------x-----x------+ | | | + dim | | Gate | | | Up | | | | | | + contiguous | | | | | | | | | | | + | | | | | | | | | | | + v +------x-----x------+------x-----x------+ +----------+ V + K0 | | | | | contiguous + / \ v v v v | + +---------+ +------x-----x------+------x-----x------+ | +M0 | A | | | | | | | | | + +---------+ +------x-----x------+------x-----x------+ | + ----------> | | | + contiguous | V V + | x-----x +----------+ + +------------> M1 | Y | ---------> | Out(O) | + ACT x-----x +----------+ + K1 = N0 dim + +* Note: Act could be Gelu/Silu/... +* Note: some model does not have Up +*/ +template +struct FusedMoeGemmShape +{ + using BlockTile_0 = remove_cvref_t; + using WarpPerBlock_0 = remove_cvref_t; + using WarpTile_0 = remove_cvref_t; + using BlockTile_1 = remove_cvref_t; + using WarpPerBlock_1 = remove_cvref_t; + using WarpTile_1 = remove_cvref_t; + + static constexpr index_t NumWarps = + reduce_on_sequence(WarpPerBlock_0{}, multiplies{}, number<1>{}); + + // TODO: we don't support half warps aound to 1 warp here + static_assert(NumWarps == reduce_on_sequence(WarpPerBlock_1{}, multiplies{}, number<1>{})); + + static constexpr index_t Block_M0 = BlockTile_0::at(number<0>{}); + static constexpr index_t Block_N0 = BlockTile_0::at(number<1>{}); + static constexpr index_t Block_K0 = BlockTile_0::at(number<2>{}); + static constexpr index_t WarpPerBlock_M0 = WarpPerBlock_0::at(number<0>{}); + static constexpr index_t WarpPerBlock_N0 = WarpPerBlock_0::at(number<1>{}); + static constexpr index_t WarpPerBlock_K0 = WarpPerBlock_0::at(number<2>{}); + static constexpr index_t Warp_M0 = WarpTile_0::at(number<0>{}); + static constexpr index_t Warp_N0 = WarpTile_0::at(number<1>{}); + static constexpr index_t Warp_K0 = WarpTile_0::at(number<2>{}); + + static constexpr index_t ThreadPerBlock_M0 = Warp_M0 *
WarpPerBlock_M0; + static constexpr index_t ThreadPerBlock_N0 = Warp_N0 * WarpPerBlock_N0; + static constexpr index_t ThreadPerBlock_K0 = Warp_K0 * WarpPerBlock_K0; + static_assert(Block_M0 % ThreadPerBlock_M0 == 0); + static_assert(Block_N0 % ThreadPerBlock_N0 == 0); + static_assert(Block_K0 % ThreadPerBlock_K0 == 0); + static constexpr index_t Repeat_M0 = Block_M0 / ThreadPerBlock_M0; + static constexpr index_t Repeat_N0 = Block_N0 / ThreadPerBlock_N0; + static constexpr index_t Repeat_K0 = Block_K0 / ThreadPerBlock_K0; + + static constexpr index_t Block_M1 = BlockTile_1::at(number<0>{}); + static constexpr index_t Block_N1 = BlockTile_1::at(number<1>{}); + static constexpr index_t Block_K1 = BlockTile_1::at(number<2>{}); + static constexpr index_t WarpPerBlock_M1 = WarpPerBlock_1::at(number<0>{}); + static constexpr index_t WarpPerBlock_N1 = WarpPerBlock_1::at(number<1>{}); + static constexpr index_t WarpPerBlock_K1 = WarpPerBlock_1::at(number<2>{}); + static constexpr index_t Warp_M1 = WarpTile_1::at(number<0>{}); + static constexpr index_t Warp_N1 = WarpTile_1::at(number<1>{}); + static constexpr index_t Warp_K1 = WarpTile_1::at(number<2>{}); + + static constexpr index_t ThreadPerBlock_M1 = Warp_M1 * WarpPerBlock_M1; + static constexpr index_t ThreadPerBlock_N1 = Warp_N1 * WarpPerBlock_N1; + static constexpr index_t ThreadPerBlock_K1 = Warp_K1 * WarpPerBlock_K1; + static_assert(Block_M1 % ThreadPerBlock_M1 == 0); + static_assert(Block_N1 % ThreadPerBlock_N1 == 0); + static_assert(Block_K1 % ThreadPerBlock_K1 == 0); + static constexpr index_t Repeat_M1 = Block_M1 / ThreadPerBlock_M1; + static constexpr index_t Repeat_N1 = Block_N1 / ThreadPerBlock_N1; + static constexpr index_t Repeat_K1 = Block_K1 / ThreadPerBlock_K1; + + static constexpr index_t BlockSize = warpSize * NumWarps; + + // consistency checks between the tile shapes of gemm-0 and gemm-1 + static_assert(Block_M0 == Block_M1); + static_assert(Block_N0 == Block_K1 || (Block_N0 / 2) == Block_K1); // Gate Only or Gate+Up + + // pre-shuffle tile size
compute (assume only for B matrix) + // we flatten each wave tile to a 1d linear tensor (at model loading time) + // e.g. originally we have Block_N*Block_K tile size, after pre-shuffle + // we can have Block_Nr*Block_Kr*Block_W, where Block_W is Warp_N*Warp_K, + // and Block_Nr=Block_N/Warp_N, Block_Kr=Block_K/Warp_K + static constexpr index_t Block_W0 = Warp_N0 * Warp_K0; + static constexpr index_t Block_Nr0 = Block_N0 / Warp_N0; + static constexpr index_t Block_Kr0 = Block_K0 / Warp_K0; + static constexpr index_t Block_W1 = Warp_N1 * Warp_K1; + static constexpr index_t Block_Nr1 = Block_N1 / Warp_N1; + static constexpr index_t Block_Kr1 = Block_K1 / Warp_K1; + + static_assert(Block_W0 == Block_W1); + // static_assert(Block_Nr0 == Block_Kr1); +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp new file mode 100644 index 0000000000..381edb650d --- /dev/null +++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#pragma once + +namespace ck_tile { + +template +struct FusedMoeGemmTilePartitioner_Linear +{ + // expected to be a FusedMoeGemmShape; only Block_M0 / Block_N0 are read here + using BlockShape = ck_tile::remove_cvref_t; + + static constexpr const char* name = "lin"; + + CK_TILE_DEVICE auto operator()(ck_tile::index_t /*num_sorted_tiles*/, + ck_tile::index_t /*intermediate_size*/) + { + index_t i_n = blockIdx.x; + index_t i_m = blockIdx.y; + + return ck_tile::make_tuple(i_m, i_n); + } + + CK_TILE_HOST static constexpr auto GridSize(index_t max_tokens, index_t intermediate_size) + { + // TODO: this may need tuning; grid is (N tiles, M tiles, 1), matching blockIdx.x/.y in operator() + index_t ms = ck_tile::integer_divide_ceil(max_tokens, BlockShape::Block_M0); + index_t ns = ck_tile::integer_divide_ceil(intermediate_size, BlockShape::Block_N0); + return dim3(ns, ms, 1); + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp new file mode 100644 index 0000000000..e9577e2304 --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp @@ -0,0 +1,651 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp" + +namespace ck_tile { + +/* +This pipeline deal with a gemm(actually 2 gemm) with one very small(token), one very big(weight) +we need to design the pipeline such that all waves along gemm-N dim (gemm-m only 1 wave) + + <----- gemm-N ------> + +----+----+----+----+ + | w0 | w1 | w2 | w3 | gemm-m + +----+----+----+----+ +*/ +template +struct FusedMoeGemmPipeline_FlatmmEx +{ + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + + using BlockShape = typename Problem::BlockShape; // this is FusedMoeGemmShape + + using ADataType = typename Problem::ADataType; + using GDataType = typename Problem::GDataType; + using DDataType = typename Problem::DDataType; + using AccDataType = typename Problem::AccDataType; + using ODataType = typename Problem::ODataType; + using AScaleDataType = typename Problem::AScaleDataType; + using GScaleDataType = typename Problem::GScaleDataType; + using DScaleDataType = typename Problem::DScaleDataType; + using YSmoothScaleDataType = typename Problem::YSmoothScaleDataType; + using TopkWeightDataType = typename Problem::TopkWeightDataType; + using IndexDataType = typename Problem::IndexDataType; + using YDataType = typename Problem::YDataType; + + using Traits = typename Problem::Traits; + + static constexpr bool IsGateOnly = Traits::IsGateOnly; + static constexpr bool UseSmoothQuant = Traits::UseSmoothQuant; + static constexpr bool PadHiddenSize = Traits::PadHiddenSize; + static constexpr bool PadIntermediateSize = Traits::PadIntermediateSize; + + static constexpr index_t kAlignmentA = Policy::template GetAlignment_A(); + static constexpr index_t kAlignmentG = Policy::template GetAlignment_G(); + static constexpr index_t kAlignmentD = Policy::template GetAlignment_D(); + static constexpr index_t kAlignmentO = Policy::template GetAlignment_O(); + + 
static constexpr index_t SLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::SLD_A); + static constexpr index_t GLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_A); + static constexpr index_t GLD_B = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_B); + static constexpr index_t GST_O = static_cast(FusedMoeGemmPipelineSequencerEnum::GST_O); + + static constexpr index_t kBlockPerCu = []() { + if constexpr(Problem::kBlockPerCu != -1) + return Problem::kBlockPerCu; + else + { + // minimize occupancy + return 2; + } + }(); + + static constexpr const char* name = "fused_moe_flatmm"; + + // TODO: there are multiple buffers + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize_A() + { + return Policy::template GetSmemSize_A(); + } + + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + // this is the thread-offset along row/col + CK_TILE_HOST_DEVICE static auto GetACoord() + { + constexpr auto a_dist = Policy::template MakeGlobalTileDistribution_A(); + const auto a_coord = a_dist.calculate_index(); + return a_coord; + } + + // this is the thread-offset along row/col + CK_TILE_HOST_DEVICE static auto GetOCoord() + { + constexpr auto o_dist = Policy::template MakeOGlobalTileDistribution(); + const auto o_coord = o_dist.calculate_index(); + return o_coord; + } + + template + CK_TILE_DEVICE auto operator()(const AWindow& a_window_, + const GWindow& g_window_, + const DWindow& d_window_, + OWindow& o_window_, + TopkWeightDataType /*topk_weight*/, + CK_TILE_LDS_ADDR void* smem, + index_t hidden_size, + index_t intermediate_size) + { + _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wc++20-extensions\""); + constexpr auto NEG1 = number<-1>{}; + constexpr auto I0 = number<0>{}; + constexpr auto I1 = number<1>{}; + constexpr auto TRUE = bool_constant{}; + constexpr auto FALSE = bool_constant{}; + + CK_TILE_LDS_ADDR ADataType* smem_0 = reinterpret_cast(smem); + 
CK_TILE_LDS_ADDR ADataType* smem_1 = reinterpret_cast( + reinterpret_cast(smem) + + Policy::template GetSmemSize_A()); + + auto g_view = g_window_.get_bottom_tensor_view(); + + auto u_view = [&]() { + if constexpr(IsGateOnly) + { + return g_view; + } + else + { + index_t nr_0 = intermediate_size / BlockShape::Block_Nr0; + index_t kr_0 = hidden_size / BlockShape::Block_Kr0; + + const GDataType* g_ptr = + g_window_.get_bottom_tensor_view().get_buffer_view().p_data_; + const GDataType* u_ptr = g_ptr + (nr_0 / 2) * kr_0 * number{}; + + const auto u_view_ = make_naive_tensor_view( + u_ptr, + make_tuple(nr_0, kr_0, number{}), + make_tuple(kr_0 * BlockShape::Block_W0, number{}, 1), + number{}, + number<1>{}); + const auto u_view_1_ = + pad_tensor_view(u_view_, + make_tuple(number{}, + number{}, + number{}), + sequence{}); + return u_view_1_; + } + }(); + + auto a_win = make_tile_window_linear( + a_window_, Policy::template MakeGlobalTileDistribution_A()); + auto g_win = + make_tile_window_linear(g_window_, + Policy::template MakeGlobalTileDistribution_G(), + sequence<0, 1, 1>{}); + auto d_win = + make_tile_window_linear(d_window_, + Policy::template MakeGlobalTileDistribution_D(), + sequence<0, 1, 1>{}); + auto o_win = make_tile_window_linear( + o_window_, Policy::template MakeGlobalTileDistribution_O()); + + using g_thread_type = decltype(load_tile(g_win)); + using d_thread_type = decltype(load_tile(d_win)); + + using WarpGemm0 = decltype(Policy::template GetWarpGemm0()); + using WarpGemm1 = decltype(Policy::template GetWarpGemm1()); + auto warp_gemm_0 = WarpGemm0{}; + auto warp_gemm_1 = WarpGemm1{}; + + // issues_warps_lanes + auto a_sst_win0 = + make_tile_window(make_tensor_view( + smem_0, Policy::template MakeLdsStoreDesc_A()), + Policy::template MakeLdsStoreDesc_A().get_lengths(), + {0, 0, 0}); + + auto a_sst_win1 = + make_tile_window(make_tensor_view( + smem_1, Policy::template MakeLdsStoreDesc_A()), + Policy::template MakeLdsStoreDesc_A().get_lengths(), + {0, 0, 
0}); + // m*k + auto a_sld_win0 = [&]() { + using WG = WarpGemm0; + constexpr auto a_outer_dstr_enc = tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + a_outer_dstr_enc, typename WG::AWarpDstrEncoding{}); + return make_tile_window_linear( + make_tensor_view( + smem_0, Policy::template MakeLdsLoadDesc_A()), + Policy::template MakeLdsLoadDesc_A().get_lengths(), + {0, 0}, + make_static_tile_distribution(a_block_dstr_encode)); + }(); + + // m*k + auto a_sld_win1 = [&]() { + using WG = WarpGemm0; + constexpr auto a_outer_dstr_enc = tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + a_outer_dstr_enc, typename WG::AWarpDstrEncoding{}); + return make_tile_window_linear( + make_tensor_view( + smem_1, Policy::template MakeLdsLoadDesc_A()), + Policy::template MakeLdsLoadDesc_A().get_lengths(), + {0, 0}, + make_static_tile_distribution(a_block_dstr_encode)); + }(); + + auto bridge_sst_win = [&]() { + return make_tile_window( + make_tensor_view( + reinterpret_cast(smem), + Policy::template MakeBridgeLdsStoreDesc()), + Policy::template MakeBridgeLdsStoreDesc().get_lengths(), + {0, 0}); + }(); + + auto bridge_sld_win = [&]() { + return make_tile_window_linear( + make_tensor_view( + reinterpret_cast(smem), + Policy::template MakeBridgeLdsLoadDesc()), + Policy::template MakeBridgeLdsLoadDesc().get_lengths(), + {0, 0}, + Policy::template MakeYTileDistribution()); + }(); + + // also OK with C array, 2 register buffer + statically_indexed_array gs; + + constexpr auto issues_a = number{}; + constexpr auto issues_g = number{}; + // constexpr auto issues_d = number{}; + // constexpr auto issues_o = number{}; + constexpr auto issues_gemm0 = + number{}; + constexpr auto 
issues_gemm1 = + number{}; + // constexpr auto issues_sld_a = number{}; + + const index_t num_blocks_k0 = + (hidden_size + BlockShape::Block_K0 - 1) / BlockShape::Block_K0; + const index_t num_blocks_n1 = + (hidden_size + BlockShape::Block_N1 - 1) / BlockShape::Block_N1; + + using a_thread_type = decltype(load_tile(a_sld_win0)); + statically_indexed_array as; + + auto gld_a = [&]>( + auto& a_store_, auto i_access, PreNop = {}) + { + async_load_tile_raw(a_store_, a_win, i_access, PreNop{}); + }; + auto move_a = [&]() { + move_tile_window(a_win, {number<0>{}, number{}}); + }; + auto sld_a = [&](auto& a_, auto& win_, auto i_access) { + load_tile_raw(a_, win_, i_access); + }; + + auto gld_g = [&]>( + auto& g_, auto i_access, PreNop = {}) + { + if constexpr(IsGateOnly) + { + // TODO: hack! + if constexpr(i_access.value == 0) + { + g_win.bottom_tensor_view_ = g_view; + } + else if constexpr(i_access.value == issues_g / 2) + { + g_win.bottom_tensor_view_ = u_view; + } + } + load_tile_raw(g_, g_win, i_access, FALSE, PreNop{}); + }; + auto move_g = [&]() { + move_tile_window(g_win, {number<0>{}, number{}, number<0>{}}); + }; + statically_indexed_array ds; + + auto gld_d = [&]>( + auto& d_, auto i_access, PreNop = {}) + { + load_tile_raw(d_, d_win, i_access, FALSE, PreNop{}); + }; + auto move_d = [&]() { + // d move along gemm-n + move_tile_window(d_win, {number{}, number<0>{}}); + }; + + auto atomic_add_o = [&]>( + auto& o_, auto i_access, PreNop = {}) + { + update_tile_raw(o_win, o_, i_access, TRUE, PreNop{}); + }; + + auto acc_0 = Policy::template MakeCBlockTile_Gemm0(); + auto acc_1s = generate_tuple( + [&](auto) { return Policy::template MakeCBlockTile_Gemm1(); }, number<2>{}); + + // clang-format off + auto gemm_0 = [&]> + (auto& t_c, auto& t_a, auto& t_b, auto i_access, PostNop = {}) { + using WarpGemm = remove_cvref_t; + + constexpr auto repeat_sub = WarpGemm::get_num_of_access(); + constexpr auto repeat_m = BlockShape::Repeat_M0; + // constexpr auto repeat_n = 
BlockShape::Repeat_N0; + constexpr auto repeat_k = BlockShape::Repeat_K0; + // loop order n->m->k + constexpr auto i_sub = i_access % repeat_sub; + constexpr auto i_k = (i_access / repeat_sub) % repeat_k; + constexpr auto i_m = (i_access / (repeat_sub * repeat_k )) % repeat_m; + constexpr auto i_n = (i_access / (repeat_sub * repeat_k )) / repeat_m; + + using AWarpTensor = typename WarpGemm::AWarpTensor; + using BWarpTensor = typename WarpGemm::BWarpTensor; + using CWarpTensor = typename WarpGemm::CWarpTensor; + using AWarpDstr = typename WarpGemm::AWarpDstr; + using BWarpDstr = typename WarpGemm::BWarpDstr; + using CWarpDstr = typename WarpGemm::CWarpDstr; + + constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t{}; + constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t{}; + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + constexpr auto a_warp_y_lengths = to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto b_warp_y_lengths = to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + + AWarpTensor w_a; + w_a.get_thread_buffer() = t_a.get_y_sliced_thread_data( + merge_sequences(sequence{}, a_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, a_warp_y_lengths)); + + BWarpTensor w_b; + w_b.get_thread_buffer() = t_b.get_y_sliced_thread_data( + merge_sequences(sequence{}, b_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, b_warp_y_lengths)); + + CWarpTensor w_c; + w_c.get_thread_buffer() = t_c.get_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); + + warp_gemm_0(w_c, w_a, w_b, number{}, PostNop{}); + + t_c.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + w_c.get_thread_buffer()); + }; + // clang-format on + + // 
clang-format off + auto gemm_1 = [&]> + (auto& t_c, auto& t_a, auto& t_b, auto i_access, PostNop = {}) { + using WarpGemm = remove_cvref_t; + + constexpr auto repeat_sub = WarpGemm::get_num_of_access(); + constexpr auto repeat_m = BlockShape::Repeat_M0; + // constexpr auto repeat_n = BlockShape::Repeat_N0; + constexpr auto repeat_k = BlockShape::Repeat_K0; + // loop order n->m->k + constexpr auto i_sub = i_access % repeat_sub; + constexpr auto i_k = (i_access / repeat_sub) % repeat_k; + constexpr auto i_m = (i_access / (repeat_sub * repeat_k )) % repeat_m; + constexpr auto i_n = (i_access / (repeat_sub * repeat_k )) / repeat_m; + + using AWarpTensor = typename WarpGemm::AWarpTensor; + using BWarpTensor = typename WarpGemm::BWarpTensor; + using CWarpTensor = typename WarpGemm::CWarpTensor; + using AWarpDstr = typename WarpGemm::AWarpDstr; + using BWarpDstr = typename WarpGemm::BWarpDstr; + using CWarpDstr = typename WarpGemm::CWarpDstr; + + constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t{}; + constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t{}; + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + constexpr auto a_warp_y_lengths = to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto b_warp_y_lengths = to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + + AWarpTensor w_a; + w_a.get_thread_buffer() = t_a.get_y_sliced_thread_data( + merge_sequences(sequence{}, a_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, a_warp_y_lengths)); + + BWarpTensor w_b; + w_b.get_thread_buffer() = t_b.get_y_sliced_thread_data( + merge_sequences(sequence{}, b_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, b_warp_y_lengths)); + + CWarpTensor w_c; + w_c.get_thread_buffer() = t_c.get_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, 
c_warp_y_lengths)); + + warp_gemm_1(w_c, w_a, w_b, number{}, PostNop{}); + + t_c.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + w_c.get_thread_buffer()); + }; + // clang-format on + _Pragma("clang diagnostic pop"); + + // this gemm pipeline is designed with assumption that issues of buffer-load/ds_read can + // be hide under mfma. In other words, issues of mfma is >= memory this is true if we + // pre-shuffle B matrix, and A matrix is relatively small we prefer use multiple mfma + // paired with 1 buffer-load B matrix, to get max throughput of buffer_load. and by + // preshuffle, we always pack to dwordx4 load, and this will already extend to multiple + // mfma but that is already consumed inside warpgemm-impl. So indeed how many extra + // mfma(that can reuse the B matrix) only affected by M repeat. + auto pipeline_gemm0 = [&]() { + constexpr index_t total_loops = issues_gemm0; + constexpr auto sr = Policy::template GetSequencer_0(); + static_assert(sr.size() == total_loops); + + constexpr auto c_sld_a_0 = MAKE_SC(); + constexpr auto c_gld_a_0 = MAKE_SC(); + constexpr auto c_gld_b_0 = MAKE_SC(); + // compute buffer 1 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_0(acc_0, as[I0], gs[I0], i_issue); + constexpr index_t slot = sr.at(i_issue); + + if constexpr(slot & SLD_A) + sld_a(as[I1], a_sld_win1, number{}); + if constexpr(slot & GLD_A) + gld_a(a_sst_win0, number{}); + if constexpr(slot & GLD_B) + gld_g(gs[I0], number{}); + }); + move_g(); + move_a(); + block_sync_load_raw(issues_a + issues_g); + lds_load_fence(); + + constexpr auto c_sld_a_1 = MAKE_SC(); + constexpr auto c_gld_a_1 = MAKE_SC(); + constexpr auto c_gld_b_1 = MAKE_SC(); + + // compute buffer 1 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_0(acc_0, as[I1], gs[I1], i_issue); + constexpr index_t slot = sr.at(i_issue); + + if constexpr(slot & SLD_A) + sld_a(as[I0], a_sld_win0, number{}); 
+ if constexpr(slot & GLD_A) + gld_a(a_sst_win1, number{}); + if constexpr(slot & GLD_B) + gld_g(gs[I1], number{}); + }); + move_g(); + move_a(); + block_sync_load_raw(issues_a + issues_g); + lds_load_fence(); + }; + + auto pipeline_gemm0_tail = [&]() { + constexpr index_t total_loops = issues_gemm0; + constexpr auto sr = Policy::template GetSequencer_0(); + static_assert(sr.size() == total_loops); + + constexpr auto c_gld_b_0 = MAKE_SC(); + + // compute buffer 0 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_0(acc_0, as[I0], gs[I0], i_issue); + constexpr index_t slot = sr.at(i_issue); + + if constexpr(slot & GLD_B) + gld_g(gs[I1], number{}); + }); + + block_sync_load_raw(issues_g); + sld_a(as[I1], a_sld_win1, NEG1); + + // compute buffer 1 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + constexpr auto last_nop = [&]() { + if constexpr(i_issue == (total_loops - 1)) + return TRUE; + else + return FALSE; + }(); + gemm_0(acc_0, as[I1], gs[I1], i_issue, last_nop); // last gemm has nop + }); + }; + + auto y = Policy::template MakeYBlockTile(); + + auto pipeline_bridge = [&]() { + // cast to Y data + auto y_pre = cast_tile(acc_0); + store_tile(bridge_sst_win, y_pre); + clear_tile(acc_1s(I0)); + // wave_barrier(); + load_tile(y, bridge_sld_win); + clear_tile(acc_1s(I1)); + }; + + // note, gemm-1 start from idx-1 to N-2 (0, 1, 2....N-1) + auto pipeline_gemm1 = [&]() { + constexpr index_t total_loops = issues_gemm1; + constexpr auto sr = Policy::template GetSequencer_1(); + static_assert(sr.size() == total_loops); + + constexpr auto c_gld_b_0 = MAKE_SC(); + constexpr auto c_gst_o_0 = MAKE_SC(); + constexpr auto c_gld_b_1 = MAKE_SC(); + constexpr auto c_gst_o_1 = MAKE_SC(); + + // compute buffer 0 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_1(acc_1s[I1], y, ds[I1], i_issue); + constexpr index_t slot = sr.at(i_issue); + if constexpr(slot & GLD_B) + gld_d(ds[I0], number{}); + + if constexpr(slot & GST_O) + { + auto out = 
cast_tile(acc_1s[I0]); + atomic_add_o(out, number{}); + } + }); + move_d(); + // move_o(); + + // compute buffer 1 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_1(acc_1s[I0], y, ds[I0], i_issue); + constexpr index_t slot = sr.at(i_issue); + if constexpr(slot & GLD_B) + gld_d(ds[I1], number{}); + + if constexpr(slot & GST_O) + { + auto out = cast_tile(acc_1s[I1]); + atomic_add_o(out, number{}); + } + }); + move_d(); + }; + + auto pipeline_gemm1_head = [&]() { + constexpr index_t total_loops = issues_gemm1; + constexpr auto sr = Policy::template GetSequencer_1(); + static_assert(sr.size() == total_loops); + + constexpr auto c_gld_b_0 = MAKE_SC(); + + // compute buffer 0 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_1(acc_1s[I0], y, ds[I0], i_issue); + constexpr index_t slot = sr.at(i_issue); + if constexpr(slot & GLD_B) + gld_d(ds[I1], number{}); + }); + move_d(); + }; + auto pipeline_gemm1_tail = [&]() { + constexpr index_t total_loops = issues_gemm1; + constexpr auto sr = Policy::template GetSequencer_1(); + static_assert(sr.size() == total_loops); + + constexpr auto c_gst_o_0 = MAKE_SC(); + + // compute buffer 1 + static_for<0, total_loops, 1>{}([&](auto i_issue) { + gemm_1(acc_1s[I1], y, ds[I1], i_issue); + + constexpr index_t slot = sr.at(i_issue); + if constexpr(slot & GST_O) + { + auto out = cast_tile(acc_1s[I0]); + atomic_add_o(out, number{}); + } + }); + { + auto out = cast_tile(acc_1s[I1]); + atomic_add_o(out, NEG1); + } + }; + + // start of pipeline + // clang-format off + gld_a(a_sst_win0, NEG1, TRUE); + gld_g(gs[I0], NEG1, TRUE); + move_a(); + move_g(); + clear_tile(acc_0); + + // preload for next round + gld_a(a_sst_win1, NEG1); + gld_g(gs[I1], NEG1); + + // make sure a,g loaded + block_sync_load_raw(issues_a + issues_g); + lds_load_fence(); + + // we manually unroll double buffer inside hot loop + const index_t iters_0 = (num_blocks_k0 - 2) / 2; + index_t i_0 = 0; // (void)i_0; (void)iters_0; (void)pipeline_gemm0; + 
while(i_0++ < iters_0) + { + pipeline_gemm0(); + } + pipeline_gemm0_tail(); + + pipeline_bridge(); + + const index_t iters_1 = (num_blocks_n1 - 2) / 2; + index_t i_1 = 0; // (void) i_1; (void)iters_1; (void)pipeline_gemm1; + pipeline_gemm1_head(); + while(i_1++ < iters_1) + { + pipeline_gemm1(); + } + pipeline_gemm1_tail(); + // clang-format on + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp new file mode 100644 index 0000000000..fea30f0297 --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp @@ -0,0 +1,831 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp" +#include "ck_tile/ops/flatmm.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp" + +namespace ck_tile { + +struct FusedMoeGemmPipelineFlatmmPolicy +{ + CK_TILE_HOST_DEVICE static constexpr index_t GetAsyncCopyDwords() + { + // TODO: always 1 dword + return 1; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_A() + { + // using async + constexpr index_t copy_bytes = 4 * GetAsyncCopyDwords(); + constexpr index_t data_bytes = sizeof(typename Problem::ADataType); + static_assert(copy_bytes % data_bytes == 0); + return copy_bytes / data_bytes; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_G() + { + constexpr index_t copy_bytes = [&]() { return 16; }(); + constexpr index_t data_bytes = sizeof(typename Problem::GDataType); + static_assert(copy_bytes % data_bytes == 0); + return copy_bytes / data_bytes; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_D() + { + constexpr index_t copy_bytes = [&]() { 
return 16; }(); + constexpr index_t data_bytes = sizeof(typename Problem::DDataType); + static_assert(copy_bytes % data_bytes == 0); + return copy_bytes / data_bytes; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_O() + { + if constexpr(Problem::Traits::OAtomic == 1) + { + // pack fp16/bf16 atomic + static_assert(sizeof(typename Problem::ODataType) == 2); + return 2; + } + else if constexpr(Problem::Traits::OAtomic == 2) + { + // fp32 atomic + return 1; + } + else + { + return 16 / sizeof(typename Problem::ODataType); + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPack() + { + // TODO: this is for 3d layout + return 16 / sizeof(remove_cvref_t); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPack_A() + { + return GetSmemKPack(); + } + + // used for bridge LDS shuffle + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPack_Y() + { + // TODO: this should match mfma layout + return 16 / sizeof(typename Problem::YDataType); + } + + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize_A() + { + constexpr auto a_sld_desc = MakeLdsLoadDesc_A(); + constexpr auto a_sst_desc = MakeLdsStoreDesc_A(); + static_assert(a_sld_desc.get_element_space_size() == a_sst_desc.get_element_space_size()); + return a_sld_desc.get_element_space_size(); + } + + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize_Bridge() + { + constexpr auto bridge_sld_desc = MakeBridgeLdsLoadDesc(); + constexpr auto bridge_sst_desc = MakeBridgeLdsStoreDesc(); + static_assert(bridge_sld_desc.get_element_space_size() == + bridge_sst_desc.get_element_space_size()); + return bridge_sld_desc.get_element_space_size(); + } + + template + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + constexpr index_t a_lds = GetSmemSize_A(); + constexpr index_t bridge_lds = GetSmemSize_Bridge(); + return max(a_lds, bridge_lds); + } + + template + CK_TILE_HOST_DEVICE static 
constexpr auto MakeGlobalTileDistribution_SimpleMxK() + { + constexpr index_t K_vec = Alignment; + constexpr index_t K_rem = KPerBlock / K_vec; + + if constexpr(get_warp_size() < K_rem) + { + static_assert(K_rem % get_warp_size() == 0); + constexpr index_t K_lan = get_warp_size(); // lane within same wave is along gemm-k + constexpr index_t K_wav = K_rem / get_warp_size(); + static_assert(K_wav <= NumWarps, "not not support thread has repeat along K yet"); + constexpr index_t M_wav = NumWarps / K_wav; + static_assert(MPerBlock % M_wav == 0, "this tile size is too small please check"); + constexpr index_t M_rep = MPerBlock / M_wav; + + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>, + tuple, sequence>, + tuple, sequence<2>>, + tuple, sequence<1>>, + sequence<1, 2>, + sequence<0, 2>>{}); + } + else + { + constexpr index_t K_lan = K_rem; + constexpr index_t M_lan = get_warp_size() / K_lan; + constexpr index_t M_wav = NumWarps; + static_assert(MPerBlock % (M_lan * M_wav) == 0, + "this tile size is too small please check"); + constexpr index_t M_rep = MPerBlock / (M_lan * M_wav); + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + } + + // optimized version for async, not same as simple MXK dist(pay attention!!) 
+ template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_SimpleMxK_Async() + { + constexpr index_t K_vec = Alignment; + constexpr index_t K_rem = KPerBlock / K_vec; + + if constexpr(get_warp_size() <= K_rem) + { + static_assert(K_rem % get_warp_size() == 0); + constexpr index_t K_lan = get_warp_size(); // lane within same wave is along gemm-k + constexpr index_t K_wav = K_rem / get_warp_size(); + static_assert(K_wav <= NumWarps, "do not support thread has repeat along K yet"); + constexpr index_t M_wav = NumWarps / K_wav; + static_assert(MPerBlock % M_wav == 0, "this tile size is too small please check"); + constexpr index_t M_rep = MPerBlock / M_wav; + // NOTE: no swap, but hard to avoid LDS bank conflict + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>, + tuple, sequence>, + tuple, sequence<2>>, + tuple, sequence<1>>, + sequence<1, 2>, + sequence<0, 2>>{}); + } + else + { + constexpr index_t K_lan = K_rem; + constexpr index_t M_lan = get_warp_size() / K_lan; + constexpr index_t M_wav = NumWarps; + static_assert(MPerBlock % (M_lan * M_wav) == 0, + "this tile size is too small please check"); + constexpr index_t M_rep = MPerBlock / (M_lan * M_wav); + // NOTE: swapped for LDS load bank conflict free + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>, + // Note M_wave(num waves) is the fastest dim, different from sipmle 2d + // distribution + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<1, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_Nr_Kr_W() + { + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, + sequence, + sequence>, + tuple, sequence<3>>, + tuple, sequence<0>>, + sequence<1, 2, 3>, + sequence<0, 0, 1>>{}); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_A() + { + constexpr index_t Block_M_ = 
Problem::BlockShape::Block_M0; + constexpr index_t Block_K_ = Problem::BlockShape::Block_K0; + constexpr index_t NumWarps_ = Problem::BlockShape::NumWarps; + constexpr index_t Alignment_ = GetAlignment_A(); + return MakeGlobalTileDistribution_SimpleMxK_Async(); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_G() + { + constexpr auto PermuteEnum = Problem::Traits::PermuteEnum; + // constexpr index_t hidden_radio_0 = Problem::Traits::IsGateOnly ? 1 : 2; + using S_ = typename Problem::BlockShape; + if constexpr(PermuteEnum == FusedMoeGemmWeightPermuteEnum::b_nr_kr_waveflatten) + { + // number{}.rrr(); + // number{}.eee(); + return MakeGlobalTileDistribution_Nr_Kr_W()>(); + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_D() + { + constexpr auto PermuteEnum = Problem::Traits::PermuteEnum; + using S_ = typename Problem::BlockShape; + if constexpr(PermuteEnum == FusedMoeGemmWeightPermuteEnum::b_nr_kr_waveflatten) + { + return MakeGlobalTileDistribution_Nr_Kr_W()>(); + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_O() + { + using S_ = remove_cvref_t; + using WarpGemm = remove_cvref_t())>; + // using CDataType = typename WarpGemm::CDataType; + + constexpr auto c_block_outer_dstr_encoding = + tile_distribution_encoding, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + return c_block_dstr; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeLdsStoreDesc_A() + { + // A async->LDS + constexpr index_t Block_M = Problem::BlockShape::Block_M0; + constexpr index_t Block_K = Problem::BlockShape::Block_K0; + // constexpr index_t BlockSize = Problem::BlockShape::BlockSize; + 
constexpr index_t warpSize = ck_tile::get_warp_size(); + constexpr index_t NumWarps = Problem::BlockShape::NumWarps; + + constexpr index_t KPack = GetSmemKPack_A(); // LDS + constexpr index_t KVector = GetAlignment_A(); // async copy 1 dword + constexpr index_t KPad = KPack; // pad between warps + + static_assert(Block_K % KVector == 0); + constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K + if constexpr(LanesPerK >= warpSize) + { + // need multiple waves to load K + static_assert(LanesPerK % warpSize == 0); + constexpr index_t wavesPerK = LanesPerK / warpSize; + if constexpr(wavesPerK > NumWarps) + { + // TODO: need multiple issues along K to load all data + } + else + { + constexpr index_t wavesPerM = NumWarps / wavesPerK; + constexpr index_t NumIssues = Block_M / wavesPerM; + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number{}), // k2 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number<1>{}), // k2 + number{}, // lds store vector(actually no explicit store) + number<1>{}); + + constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple( + make_pass_through_transform(number{}), + make_merge_transform(make_tuple(number{}, number{})), + make_merge_transform(make_tuple(number{}, number{}))), + make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})); + + return lds_block_desc_issues_warps_lanes; + } + } + else + { + // lanes within a wave load different M but same K + static_assert(warpSize % LanesPerK == 0); + constexpr index_t LaneGroups = warpSize / LanesPerK; // along m + constexpr index_t NumIssues = Block_M / (LaneGroups * NumWarps); + + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + 
number{}, // k0 + number{}), // k1 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + number{}, // k0 + number<1>{}), // k1 + number{}, // lds store vector(actually no explicit store) + number<1>{}); + + constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple(make_pass_through_transform(number{}), + make_pass_through_transform(number{}), + make_merge_transform(make_tuple( + number{}, number{}, number{}))), + make_tuple(sequence<0>{}, sequence<2>{}, sequence<1, 3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})); + + return lds_block_desc_issues_warps_lanes; + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeLdsLoadDesc_A() + { + // A async->LDS + // Note that, this descriptor is only to construct the layout inside LDS + // in real Gemm pipeline, ds_read may not follow this pattern + // (may follow that in tile_distribution) + // below code is almost the same as SmemStore dist, with difference: + // 1). modify the GuaranteedLastDimensionVectorLength of naive tensor desc + // 2). 
return discriptor is in NxK 2d layout + constexpr index_t Block_M = Problem::BlockShape::Block_M0; + constexpr index_t Block_K = Problem::BlockShape::Block_K0; + // constexpr index_t BlockSize = Problem::BlockShape::BlockSize; + constexpr index_t warpSize = ck_tile::get_warp_size(); + constexpr index_t NumWarps = Problem::BlockShape::NumWarps; + + constexpr index_t KPack = GetSmemKPack_A(); // LDS + constexpr index_t KVector = GetAlignment_A(); // async copy 1 dword + constexpr index_t KPad = KPack; // pad between warps + + static_assert(Block_K % KVector == 0); + constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K + if constexpr(LanesPerK >= warpSize) + { + // need multiple waves to load K + static_assert(LanesPerK % warpSize == 0); + constexpr index_t wavesPerK = LanesPerK / warpSize; + if constexpr(wavesPerK >= NumWarps) + { + // TODO: need multiple issues along K to load all data + } + else + { + constexpr index_t wavesPerM = NumWarps / wavesPerK; + constexpr index_t NumIssues = Block_M / wavesPerM; + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number{}), // k2 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // k0 + number{}, // k1 + number<1>{}), // k2 + number{}, // lds load vector + number<1>{}); + + constexpr auto lds_desc_m_k = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple( + make_merge_transform(make_tuple(number{}, number{})), + make_merge_transform(make_tuple( + number{}, number{}, number{}))), + make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return lds_desc_m_k; + } + } + else + { + // lanes within a wave load different M but same K + static_assert(warpSize % LanesPerK == 0); + constexpr index_t LaneGroups = warpSize / LanesPerK; // along m + constexpr index_t NumIssues = Block_M / (LaneGroups * NumWarps); + + constexpr auto lds_block_desc_0 = 
make_naive_tensor_descriptor( + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + number{}, // k0 + number{}), // k1 + make_tuple(number{}, // m0 + number{}, // m1 + number{}, // m2 + number{}, // k0 + number<1>{}), // k1 + number{}, // lds load vector + number<1>{}); + + constexpr auto lds_desc_m_k = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple( + make_merge_transform( + make_tuple(number{}, number{}, number{})), + make_merge_transform(make_tuple(number{}, number{}))), + make_tuple(sequence<0, 1, 2>{}, sequence<3, 4>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return lds_desc_m_k; + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsLoadDesc() + { + constexpr index_t Block_M = Problem::BlockShape::Block_M0; + constexpr index_t Block_N = Problem::BlockShape::Block_N0; + + constexpr index_t KVector = GetSmemKPack_Y(); // async copy 1 dword + constexpr index_t KPad = 0; // pad between warps + + constexpr auto desc = + make_naive_tensor_descriptor(make_tuple(number{}, number{}), + make_tuple(number{}, number<1>{}), + number{}, + number<1>{}); + return desc; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsStoreDesc() + { + constexpr index_t Block_M = Problem::BlockShape::Block_M0; + constexpr index_t Block_N = Problem::BlockShape::Block_N0; + + constexpr index_t KVector = GetSmemKPack_Y(); // async copy 1 dword + constexpr index_t KPad = 0; // KVector; // pad between warps + + constexpr auto desc = + make_naive_tensor_descriptor(make_tuple(number{}, number{}), + make_tuple(number{}, number<1>{}), + number{}, + number<1>{}); + return desc; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsStoreForUKDesc() + { + constexpr index_t WarpPerBlock_N = Problem::BlockShape::WarpPerBlock_N0; + constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N0; + constexpr index_t Repeat_M = Problem::BlockShape::Repeat_M0; + + constexpr index_t kAMLane = 16; + constexpr 
index_t kABKLane = 4; + constexpr index_t kABKPerLane = 4; + + constexpr index_t KPack = kABKPerLane; + + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, // m + number{}, // n + number{}, // n + number{}, // n + number{}, // m + number{}), // n + make_tuple(number{}, // m + number{}, // n + number{}, // n + number{}, // n + number{}, // m + number<1>{}), // n + number{}, // lds store vector(actually no explicit store) + number<1>{}); + + constexpr auto desc = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple(make_merge_transform(make_tuple(number{}, number{})), + make_merge_transform(make_tuple(number{}, + number{}, + number{}, + number{}))), + make_tuple(sequence<0, 4>{}, sequence<1, 2, 3, 5>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return desc; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemm0() + { + using S_ = typename Problem::BlockShape; + // A is vgpr, B is agpr. But since we transposed, so also need swap this + // TODO: this is ugly + constexpr auto wg_ctrl = WGAttrCtlEnum::Raw_avv; + // TODO: ugly + if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16) + { + return WarpGemmImpl, + 2>>{}; + } + else if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 32) + { + return WarpGemmImpl, + 2>>{}; + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSequencer_0() + { + // this function return seq<...> used to identify gld/sld/valu... 
inside mfma sequence + // the purpose is to hide thoes instructions under mfma + // every value inside seq<...> is a mask, indicating a specific operation + using S_ = typename Problem::BlockShape; + constexpr index_t SLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::SLD_A); + constexpr index_t GLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_A); + constexpr index_t GLD_B = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_B); + if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16 && + S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 && + S_::Block_N1 == 128) + { + // Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async + // gld_a 8x ds_read_b128 sld_a total 64 slot :) + // clang-format off + constexpr auto seq_all = + // 0 1 2 3 4 5 6 7 + sequence{}; // 7 + return seq_all; + // clang-format on + } + else if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16 && + S_::Block_M0 == 32 && S_::Block_N0 == 256 && S_::Block_K0 == 128 && + S_::Block_N1 == 128) + { + // Total 32 instructions, 16 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async + // gld_a 8x ds_read_b128 sld_a total 64 slot :) + // clang-format off + constexpr auto seq_all = + // 0 1 2 3 4 5 6 7 + sequence{}; // 3 + return seq_all; + // clang-format on + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSequencer_1() + { + // this function return seq<...> used to identify gld/sld/valu... 
inside mfma sequence + // the purpose is to hide thoes instructions under mfma + // every value inside seq<...> is a mask, indicating a specific operation + using S_ = typename Problem::BlockShape; + constexpr index_t GLD_B = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_B); + constexpr index_t GST_O = static_cast(FusedMoeGemmPipelineSequencerEnum::GST_O); + if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M1 == 32 && S_::Warp_N1 == 32 && S_::Warp_K1 == 16 && + S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 && + S_::Block_N1 == 128) + { + // Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async + // gld_a 8x ds_read_b128 sld_a total 64 slot :) + // clang-format off + constexpr auto seq_all = + // 0 1 2 3 4 5 6 7 + sequence{}; // 7 + return seq_all; + // clang-format on + } + else if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M1 == 32 && S_::Warp_N1 == 32 && S_::Warp_K1 == 16 && + S_::Block_M0 == 32 && S_::Block_N0 == 256 && S_::Block_K0 == 128 && + S_::Block_N1 == 128) + { + // Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async + // gld_a 8x ds_read_b128 sld_a total 64 slot :) + // clang-format off + constexpr auto seq_all = + // 0 1 2 3 4 5 6 7 + sequence{}; // 3 + return seq_all; + // clang-format on + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemm1() + { + using S_ = typename Problem::BlockShape; + constexpr auto wg_ctrl = WGAttrCtlEnum::Raw_avv; + // TODO: ugly + if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16) + { + return WarpGemmImpl, + 2>>{}; + } + else if constexpr(std::is_same_v && + std::is_same_v && + S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 32) + { + return WarpGemmImpl, + 2>>{}; + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeCBlockTile_Gemm0() + { + using S_ = remove_cvref_t; + using WarpGemm = remove_cvref_t())>; 
+ using CDataType = typename WarpGemm::CDataType; + + constexpr auto c_block_outer_dstr_encoding = + tile_distribution_encoding, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + auto c_block_tensor = make_static_distributed_tensor(c_block_dstr); + return c_block_tensor; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeCBlockTile_Gemm1() + { + using S_ = remove_cvref_t; + using WarpGemm = remove_cvref_t())>; + using CDataType = typename WarpGemm::CDataType; + + constexpr auto c_block_outer_dstr_encoding = + tile_distribution_encoding, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + auto c_block_tensor = make_static_distributed_tensor(c_block_dstr); + return c_block_tensor; + } + + // this is used as A matrix for 2nd gemm + template + CK_TILE_HOST_DEVICE static constexpr auto MakeYTileDistribution() + { + using S_ = remove_cvref_t; + using WarpGemm = remove_cvref_t())>; + + // TODO: all waves a along different N, but same M + constexpr auto y_outer_dstr_enc = + tile_distribution_encoding, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto y_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + y_outer_dstr_enc, typename WarpGemm::AWarpDstrEncoding{}); + constexpr auto y_block_dstr = make_static_tile_distribution(y_block_dstr_encode); + return y_block_dstr; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeYBlockTile() + { + 
constexpr auto y_block_dstr = MakeYTileDistribution(); + auto y_block_tensor = + make_static_distributed_tensor(y_block_dstr); + return y_block_tensor; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetUK_0() + { + using S_ = typename Problem::BlockShape; + if constexpr(std::is_same_v && + std::is_same_v && + S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 && + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32) + { + return Flatmm_32x512x128_1x4x1_16x16x32_BF16{}; + } + else if constexpr(std::is_same_v && + std::is_same_v && + S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 && + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32) + { + return Flatmm_32x512x128_1x4x1_16x16x32_FP16{}; + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetUK_1() + { + using S_ = typename Problem::BlockShape; + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 && + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32) + { + return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16{}; + } + else if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 && + S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32) + { + return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{}; + } + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp new file mode 100644 index 0000000000..a6f71eafac --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp @@ -0,0 +1,354 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp" + +namespace ck_tile { + +/* +This pipeline deal with a gemm(actually 2 gemm) with one very small(token), one very big(weight) +we need to design the pipeline such that all waves along gemm-N dim (gemm-m only 1 wave) + + <----- gemm-N ------> + +----+----+----+----+ + | w0 | w1 | w2 | w3 | gemm-m + +----+----+----+----+ +*/ +template +struct FusedMoeGemmPipeline_FlatmmUk +{ + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + + using BlockShape = typename Problem::BlockShape; // this is FusedMoeGemmShape + + using ADataType = typename Problem::ADataType; + using GDataType = typename Problem::GDataType; + using DDataType = typename Problem::DDataType; + using AccDataType = typename Problem::AccDataType; + using ODataType = typename Problem::ODataType; + using AScaleDataType = typename Problem::AScaleDataType; + using GScaleDataType = typename Problem::GScaleDataType; + using DScaleDataType = typename Problem::DScaleDataType; + using YSmoothScaleDataType = typename Problem::YSmoothScaleDataType; + using TopkWeightDataType = typename Problem::TopkWeightDataType; + using IndexDataType = typename Problem::IndexDataType; + using YDataType = typename Problem::YDataType; + + using Traits = typename Problem::Traits; + + static constexpr bool IsGateOnly = Traits::IsGateOnly; + static constexpr bool UseSmoothQuant = Traits::UseSmoothQuant; + static constexpr bool PadHiddenSize = Traits::PadHiddenSize; + static constexpr bool PadIntermediateSize = Traits::PadIntermediateSize; + + static constexpr index_t kAlignmentA = Policy::template GetAlignment_A(); + static constexpr index_t kAlignmentG = Policy::template GetAlignment_G(); + static constexpr index_t kAlignmentD = Policy::template GetAlignment_D(); + static constexpr index_t kAlignmentO = Policy::template GetAlignment_O(); + + 
static constexpr index_t SLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::SLD_A); + static constexpr index_t GLD_A = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_A); + static constexpr index_t GLD_B = static_cast(FusedMoeGemmPipelineSequencerEnum::GLD_B); + static constexpr index_t GST_O = static_cast(FusedMoeGemmPipelineSequencerEnum::GST_O); + + static constexpr index_t kBlockPerCu = []() { + if constexpr(Problem::kBlockPerCu != -1) + return Problem::kBlockPerCu; + else + { + // minimize occupancy + return 2; + } + }(); + + static constexpr const char* name = "flatmm_uk"; + + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + constexpr index_t smem_0 = Policy::template GetUK_0().GetSmemSize(); + constexpr index_t smem_1 = Policy::template GetUK_1().GetSmemSize(); + constexpr index_t smem_bridge = + BlockShape::Block_M0 * BlockShape::Block_N0 * sizeof(YDataType); + return max(smem_0, max(smem_1, smem_bridge)); + } + + // this is the thread-offset along row/col + CK_TILE_HOST_DEVICE static auto GetACoord() + { + constexpr auto a_dist = Policy::template MakeGlobalTileDistribution_A(); + const auto a_coord = a_dist.calculate_index(); + return a_coord; + } + + // this is the thread-offset along row/col + CK_TILE_HOST_DEVICE static auto GetOCoord() + { + constexpr auto o_dist = Policy::template MakeOGlobalTileDistribution(); + const auto o_coord = o_dist.calculate_index(); + return o_coord; + } + + CK_TILE_DEVICE constexpr auto GetNumRowCoords_A() + { + constexpr index_t KLans = BlockShape::Block_K0 / kAlignmentA; + constexpr index_t MLans = BlockShape::BlockSize / KLans; + constexpr index_t MRepeat = BlockShape::Block_M0 / MLans; + + return MRepeat; + } + + // TODO: properlly support scatter/gather + CK_TILE_DEVICE auto GetRowCoords_A(index_t base_offset) + { + constexpr index_t KLans = BlockShape::Block_K0 / kAlignmentA; + constexpr index_t MLans = BlockShape::BlockSize / KLans; + constexpr index_t MRepeat = BlockShape::Block_M0 / 
MLans; + + auto base_coord = threadIdx.x / KLans + base_offset; + + array coords; + static_for<0, MRepeat, 1>{}([&](auto i) { coords.at(i) = base_coord + i * MLans; }); + + return coords; + } + + template + CK_TILE_DEVICE auto GetRowID(const ROW_COORDS coords, const IndexDataType* sorted_token_ids_ptr) + { + constexpr index_t n_size = coords.size(); + + array row_ids; + static_for<0, n_size, 1>{}([&](auto i) { + row_ids.at(i) = sorted_token_ids_ptr[coords[i]]; // base_coord + i * MLans; + }); + + return row_ids; + } + + template + CK_TILE_DEVICE auto GetWeightScale(const ROW_COORDS coords, + const TopkWeightDataType* sorted_weight_ptr) + { + constexpr index_t n_size = coords.size(); + + array w; + static_for<0, n_size, 1>{}([&](auto i) { + w.at(i) = sorted_weight_ptr[coords[i]]; // base_coord + i * MLans; + }); + + return w; + } + + // TODO: this row id is before shuffle atomic, need use acc distribution + CK_TILE_DEVICE auto GetRowCoords_O(index_t base_offset) + { + constexpr index_t MLanes = BlockShape::Warp_M1; + constexpr index_t Repeat_M = BlockShape::Repeat_M1; + + auto base_coord = threadIdx.x % MLanes + base_offset; + + array coords; + static_for<0, Repeat_M, 1>{}([&](auto i) { coords.at(i) = base_coord + i * MLanes; }); + + return coords; + } + + template + CK_TILE_DEVICE auto operator()(const Karg& kargs, + CK_TILE_LDS_ADDR void* smem, + index_t sorted_tile_id, + index_t intermediate_tile_id) + { + constexpr index_t hidden_radio_0 = IsGateOnly ? 
1 : 2; + ck_tile::index_t shared_intermediate_size_0 = kargs.intermediate_size; + ck_tile::index_t shared_intermediate_size_1 = kargs.intermediate_size / hidden_radio_0; + + index_t nr_0 = shared_intermediate_size_0 / BlockShape::Warp_N0; // divide N in W + index_t kr_0 = kargs.hidden_size / BlockShape::Warp_K0; // divide K in W + index_t nr_1 = kargs.hidden_size / BlockShape::Warp_N1; + index_t kr_1 = shared_intermediate_size_1 / BlockShape::Warp_K1; + + const IndexDataType expert_id = __builtin_amdgcn_readfirstlane( + reinterpret_cast(kargs.sorted_expert_ids_ptr)[sorted_tile_id]); + index_t expert_stride_0 = shared_intermediate_size_0 * kargs.hidden_size; + index_t expert_stride_1 = shared_intermediate_size_1 * kargs.hidden_size; + + // nr*kr*w + index_t interm_idx_nr0 = __builtin_amdgcn_readfirstlane( + intermediate_tile_id * + BlockShape::Block_Nr0); // intermediate_tile_id * Block_N / (N in W) + + index_t interm_idx_kr1 = __builtin_amdgcn_readfirstlane( + intermediate_tile_id * + BlockShape::Block_Kr1); // intermediate_tile_id * Block_N / (N in W) + + auto row_coords_a = GetRowCoords_A(sorted_tile_id * BlockShape::Block_M0); + auto row_ids_a = GetRowID( + row_coords_a, reinterpret_cast(kargs.sorted_token_ids_ptr)); + auto a_coords = generate_tuple( + [&](auto i) { + return row_ids_a[i] * kargs.stride_token + + threadIdx.x % (BlockShape::Block_K0 / kAlignmentA) * kAlignmentA; + }, + number{}); + auto a_res = + make_wave_buffer_resource(reinterpret_cast(kargs.a_ptr), + kargs.num_tokens * kargs.stride_token * sizeof(ADataType)); + + auto g_win = [&]() { + const GDataType* g_ptr = reinterpret_cast(kargs.g_ptr) + + static_cast(expert_id) * expert_stride_0 + + interm_idx_nr0 * kr_0 * BlockShape::Block_W0; + auto g_view_ = make_naive_tensor_view( + g_ptr, + make_tuple(nr_0, kr_0, number{}), + make_tuple(kr_0 * BlockShape::Block_W0, number{}, 1), + number{}, + number<1>{}); + + auto g_window_ = make_tile_window_linear_raw( + g_view_, + make_tuple(number{}, + number{}, 
+ number{}), + {0, 0, 0}, + Policy::template MakeGlobalTileDistribution_G(), + sequence<0, 1, 1>{}); + return g_window_; + }(); + + auto g_res = g_win.get_bottom_tensor_view().get_buffer_view().cached_buf_res_; + auto g_coords = generate_tuple([&](auto i) { return g_win.cached_coords_[i].get_offset(); }, + number{}); + + const auto d_win = [&]() { + const DDataType* d_ptr = reinterpret_cast(kargs.d_ptr) + + static_cast(expert_id) * expert_stride_1 + + interm_idx_kr1 * BlockShape::Block_W1; + // note interm_idx_nr0 is along the gemm-k dim of 2nd gemm + + const auto d_view_ = make_naive_tensor_view( + d_ptr, + make_tuple(nr_1, kr_1, BlockShape::Block_W1), + make_tuple(kr_1 * BlockShape::Block_W1, BlockShape::Block_W1, 1), + number{}, + number<1>{}); + + const auto d_window_ = make_tile_window_linear_raw( + d_view_, + make_tuple(number{}, + number{}, + number{}), + {0, 0, 0}, + Policy::template MakeGlobalTileDistribution_D(), + sequence<0, 1, 1>{}); + return d_window_; + }(); + auto d_res = d_win.get_bottom_tensor_view().get_buffer_view().cached_buf_res_; + + // TODO: load D order is N0.K0...127, N64.K0...127, N0.K128...255, N64.K128...255 + // block-k=512, block-n=128 + // wg |<----- W_ ----->| + // Nr(2)*Nw(4)* Kr *Kr0(4)*Kr1(4) * [Kl(4)*Nl(16)*Kv(8)]->one issue + // y p y y p p y + // 1 2 0(imm) + auto d_coords = [&]() { + constexpr index_t Nr_ = 2; + constexpr index_t Nw_ = 4; + constexpr index_t Kr0_ = 4; + constexpr index_t Kr1_ = 4; + constexpr index_t Kl_ = 4; + constexpr index_t Nl_ = 16; + constexpr index_t Kv_ = 8; + constexpr index_t W_ = Kl_ * Nl_ * Kv_; + constexpr index_t num_offsets_ = Nr_ * Kr0_; + index_t base_os_ = (threadIdx.x % 64) * Kv_ + (threadIdx.x / 64) * + shared_intermediate_size_1 * + Nl_; // Kr0_ * Kr1_ * W_; + return generate_tuple( + [&](auto i) { + constexpr auto i_nr_ = number{}; + constexpr auto i_kr0_ = number{}; + + return i_nr_ * shared_intermediate_size_1 * Nw_ * Nl_ + i_kr0_ * Kr1_ * W_ + + base_os_; + }, + number{}); + }(); + + 
auto o_coords = generate_tuple( + [&](auto i) { + return row_ids_a[i] * kargs.stride_token + + threadIdx.x % (BlockShape::Block_N1 / kAlignmentO) * kAlignmentO; + }, + number{}); + + auto o_flags = + generate_tuple([&](auto i) { return cmp_lt_to_exec(row_ids_a[i], kargs.num_tokens); }, + number{}); + + auto bridge_sst_win = [&]() { + constexpr auto desc_ = Policy::template MakeBridgeLdsStoreForUKDesc(); + constexpr auto dist_ = Policy::template GetUK_0().MakeCBlockDist(); + return make_tile_window_linear(make_tensor_view( + reinterpret_cast(smem), desc_), + desc_.get_lengths(), + {0, 0}, + dist_); + }(); + auto o_res = + make_wave_buffer_resource(reinterpret_cast(kargs.o_ptr), + kargs.num_tokens * kargs.stride_token * sizeof(ODataType)); + + auto row_coords_o = GetRowCoords_O(sorted_tile_id * BlockShape::Block_M0); + auto w_scale = GetWeightScale( + row_coords_o, reinterpret_cast(kargs.sorted_weight_ptr)); + + auto uk_0 = Policy::template GetUK_0(); + auto acc_0 = uk_0(a_res, + a_coords, + g_res, + g_coords, + smem, + kargs.hidden_size, + BlockShape::Block_K0, // tile offset for B matrix each unroll + BlockShape::Block_Kr0 * + BlockShape::Block_W0); // tile offset for B matrix each unroll + + sweep_tile( + acc_0, + [&](auto idx0, auto idx1) { + fp32x2_t v_{acc_0(idx0), acc_0(idx1)}; + typename Problem::GateActivation{}(v_, v_); + acc_0(idx0) = v_.x; + acc_0(idx1) = v_.y; + }, + sequence<1, 2>{}); + + auto y_pre = cast_tile(acc_0); + + block_sync_lds(); + + store_tile(bridge_sst_win, y_pre); + block_sync_lds(); + + auto uk_1 = Policy::template GetUK_1(); + uk_1(d_res, + d_coords, + o_res, + o_coords, + o_flags, + smem, + kargs.hidden_size, // total n number + w_scale, + BlockShape::Block_Nr1 * kr_1 * BlockShape::Block_W1, // along N + BlockShape::Block_N1); // along N + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp new 
file mode 100644 index 0000000000..6089c2558f --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +// TODO: alow 2 gemm have different type +template +struct FusedMoeGemmPipelineProblem +{ + using ADataType = remove_cvref_t; + using GDataType = remove_cvref_t; + using DDataType = remove_cvref_t; + using AccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + using AScaleDataType = remove_cvref_t; + using GScaleDataType = remove_cvref_t; + using DScaleDataType = remove_cvref_t; + using YSmoothScaleDataType = remove_cvref_t; + using TopkWeightDataType = remove_cvref_t; + using IndexDataType = remove_cvref_t; + + // the input for next gemm should have same time as + using YDataType = ADataType; + + using GateActivation = remove_cvref_t; + using BlockShape = remove_cvref_t; + using Traits = remove_cvref_t; +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp new file mode 100644 index 0000000000..d7127b098c --- /dev/null +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +enum class FusedMoeGemmWeightPermuteEnum +{ + // permute_b_n0_k0_n1_k1_n2_k2 = 0, // 0,1,4,2,5,3,6 + // permute_b_n0_n1_k0_k1_n2_k2 = 1, // 0,1,2,4,5,3,6 + no_permute = 0, + b_nr_kr_kw_nw_kv = 1, // 0,1,3,4,2,5 + b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv, +}; + +template +struct FusedMoeGemmTraits +{ + // Gate+Up or Gate only + static constexpr bool IsGateOnly = IsGateOnly_; + static constexpr bool UseSmoothQuant = UseSmoothQuant_; + static constexpr index_t OAtomic = OAtomic_; + static constexpr FusedMoeGemmWeightPermuteEnum PermuteEnum = PermuteEnum_; + static constexpr bool PadHiddenSize = PadHiddenSize_; + static constexpr bool PadIntermediateSize = PadIntermediateSize_; +}; + +// Note: this need to be a bit mask +enum class FusedMoeGemmPipelineSequencerEnum +{ + SLD_A = 1 << 0, // shared load a + SLD_B = 1 << 1, + GLD_A = 1 << 2, // global load a + GLD_B = 1 << 3, + SST_A = 1 << 4, // shared store a + SST_B = 1 << 5, + GST_O = 1 << 6, // global store out +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index 7ca4a697a7..89ea82c5bd 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -10,114 +10,134 @@ namespace ck_tile { // fp16 -using WarpGemmMfmaF16F16F32M32N32K8 = - WarpGemmImpl>; -using WarpGemmMfmaF16F16F32M16N16K16 = - WarpGemmImpl>; +using WarpGemmMfmaF16F16F32M32N32K8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfmaF16F16F32M32N32K16 = - WarpGemmImpl>; +using WarpGemmMfmaF16F16F32M16N16K16 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfmaF16F16F32M16N16K32 = - WarpGemmImpl>; +using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl, + 2>>; -using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl< - WarpGemmAtrributeMfmaIterateK_SwizzleA>; +using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl, + 2>>; -using 
WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl< - WarpGemmAtrributeMfmaIterateK_SwizzleA>; +using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl, + 1>>; -using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl, + 2>>; -using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution = + WarpGemmImpl>>; + +using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution = + WarpGemmImpl>>; using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution = WarpGemmImpl, 2>>; using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution = WarpGemmImpl, 2>>; using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl, 2>>; // bf16 -using WarpGemmMfmaBf16Bf16F32M32N32K8 = - WarpGemmImpl>; -using WarpGemmMfmaBf16Bf16F32M16N16K16 = - WarpGemmImpl>; +using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfmaBf16Bf16F32M32N32K16 = - WarpGemmImpl>; +using WarpGemmMfmaBf16Bf16F32M16N16K16 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfmaBf16Bf16F32M16N16K32 = - WarpGemmImpl>; +using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl, + 2>>; -using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl< - WarpGemmAtrributeMfmaIterateK_SwizzleA>; +using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl, + 2>>; -using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA = WarpGemmImpl< - WarpGemmAtrributeMfmaIterateK_SwizzleA>; +using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl, + 1>>; -using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA = + WarpGemmImpl, + 2>>; -using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution = WarpGemmImpl< - 
WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution = + WarpGemmImpl>>; + +using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution = + WarpGemmImpl>>; using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution = WarpGemmImpl, 2>>; using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution = WarpGemmImpl, 2>>; using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl, 2>>; // fp8 -using WarpGemmMfma_f32_32x32x16_fp8_fp8 = - WarpGemmImpl>; -using WarpGemmMfma_f32_32x32x16_fp8_bf8 = - WarpGemmImpl>; +using WarpGemmMfma_f32_32x32x16_fp8_fp8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfma_f32_32x32x16_bf8_fp8 = - WarpGemmImpl>; +using WarpGemmMfma_f32_32x32x16_fp8_bf8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfma_f32_32x32x16_bf8_bf8 = - WarpGemmImpl>; +using WarpGemmMfma_f32_32x32x16_bf8_fp8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfma_f32_32x32x16_bf8_bf8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; -using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = + WarpGemmImpl>>; -using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed = + WarpGemmImpl>>; -using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed = WarpGemmImpl< - WarpGemmAtrributeMfmaTransposedCDistribution>; +using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed = + WarpGemmImpl>>; + +using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed = + WarpGemmImpl>>; template using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl, + WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base, 2, swizzle_factor>>; diff --git 
a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp index d80e5198e6..0a8d2dfbe3 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp @@ -25,6 +25,8 @@ struct WarpGemmAtrributeMfma static constexpr index_t kN = Impl::kN; static constexpr index_t kK = Impl::kK; + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -51,10 +53,13 @@ struct WarpGemmAtrributeMfma sequence<0, 2>>; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { - Impl{}(c_vec, a_vec, b_vec); + Impl{}(c_vec, a_vec, b_vec, bool_constant{}); } // c_vec = a_vec * b_vec @@ -85,6 +90,8 @@ struct WarpGemmAtrributeMfmaIterateK static constexpr index_t kN = Impl::kN; static constexpr index_t kK = Impl::kK * kKIter; + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -111,8 +118,11 @@ struct WarpGemmAtrributeMfmaIterateK sequence<0, 2>>; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { using buf_a = thread_buffer; using buf_b = thread_buffer; @@ -122,10 +132,33 @@ struct WarpGemmAtrributeMfmaIterateK reinterpret_cast(a_vec) .template get_as()[iKIter], reinterpret_cast(b_vec) - .template get_as()[iKIter]); + .template get_as()[iKIter], + bool_constant{}); }); } + template + CK_TILE_DEVICE void 
operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + number, + bool_constant = {}) const + { + using buf_a = thread_buffer; + using buf_b = thread_buffer; + + static_assert(iKIter < kKIter); + + // static_for<0, kKIter, 1>{}([&](auto iKIter) { + Impl{}(c_vec, + reinterpret_cast(a_vec) + .template get_as()[iKIter], + reinterpret_cast(b_vec) + .template get_as()[iKIter], + bool_constant{}); + //}); + } + // c_vec = a_vec * b_vec CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const { @@ -168,6 +201,8 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution static constexpr index_t kN = Impl::kM; static constexpr index_t kK = Impl::kK; + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -194,11 +229,14 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution sequence<0, 2>>; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { // swap A and B - Impl{}(c_vec, b_vec, a_vec); + Impl{}(c_vec, b_vec, a_vec, bool_constant{}); } // c_vec = a_vec * b_vec @@ -226,6 +264,8 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB static constexpr index_t kN = Impl::kM; static constexpr index_t kK = Impl::kK; + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -255,12 +295,15 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB sequence<2, 2>, sequence<0, 2>>; + template // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& 
a_vec, + const BVecType& b_vec, + bool_constant = {}) const { // swap A and B - Impl{}(c_vec, b_vec, a_vec); + Impl{}(c_vec, b_vec, a_vec, bool_constant{}); } // c_vec = a_vec * b_vec @@ -291,6 +334,8 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution static constexpr index_t kN = Impl::kM; static constexpr index_t kK = Impl::kK * kKIter; + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -316,9 +361,12 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution sequence<2, 2>, sequence<0, 2>>; + template // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { using buf_a = thread_buffer; using buf_b = thread_buffer; @@ -328,10 +376,34 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution reinterpret_cast(b_vec) .template get_as()[iKIter], reinterpret_cast(a_vec) - .template get_as()[iKIter]); + .template get_as()[iKIter], + bool_constant{}); }); } + template + // c_vec += a_vec * b_vec + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + number, + bool_constant = {}) const + { + using buf_a = thread_buffer; + using buf_b = thread_buffer; + + static_assert(iKIter < kKIter); + // swap A and B, value and type + // static_for<0, kKIter, 1>{}([&](auto iKIter) { + Impl{}(c_vec, + reinterpret_cast(b_vec) + .template get_as()[iKIter], + reinterpret_cast(a_vec) + .template get_as()[iKIter], + bool_constant{}); + //}); + } + // c_vec = a_vec * b_vec CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const { @@ -377,6 +449,8 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB static constexpr index_t kK = Impl::kK * kKIter; 
static constexpr index_t SFactor = SFactor_; // group how many CM1 together + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple, sequence>, @@ -429,8 +503,11 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB sequence<0, 2>>; #endif // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { using buf_a = thread_buffer; using buf_b = thread_buffer; @@ -440,10 +517,33 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB reinterpret_cast(b_vec) .template get_as()[iKIter], reinterpret_cast(a_vec) - .template get_as()[iKIter]); + .template get_as()[iKIter], + bool_constant{}); }); } + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + number, + bool_constant = {}) const + { + using buf_a = thread_buffer; + using buf_b = thread_buffer; + + static_assert(iKIter < kKIter); + // swap A and B, value and type + // static_for<0, kKIter, 1>{}([&](auto iKIter) { + Impl{}(c_vec, + reinterpret_cast(b_vec) + .template get_as()[iKIter], + reinterpret_cast(a_vec) + .template get_as()[iKIter], + bool_constant{}); + //}); + } + // c_vec = a_vec * b_vec CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const { @@ -488,6 +588,8 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA static constexpr index_t kK = Impl::kK * kKIter; static constexpr index_t SFactor = SFactor_; // group how many CM1 together + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } + using AWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple>; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const 
AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { using buf_a = thread_buffer; using buf_b = thread_buffer; @@ -529,10 +634,33 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA reinterpret_cast(a_vec) .template get_as()[iKIter], reinterpret_cast(b_vec) - .template get_as()[iKIter]); + .template get_as()[iKIter], + bool_constant{}); }); } + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + number, + bool_constant = {}) const + { + using buf_a = thread_buffer; + using buf_b = thread_buffer; + + static_assert(iKIter < kKIter); + + // static_for<0, kKIter, 1>{}([&](auto iKIter) { + Impl{}(c_vec, + reinterpret_cast(a_vec) + .template get_as()[iKIter], + reinterpret_cast(b_vec) + .template get_as()[iKIter], + bool_constant{}); + //}); + } + // c_vec = a_vec * b_vec CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const { diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp index bb59a72982..0aba1f5355 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -7,12 +7,68 @@ namespace ck_tile { +// TODO: refactor warp-gemm +// currently there is a discrepency for vav/vva if we need transpose C/D +// e.g. 
if we want A:agpr, B:vgpr, we have to use vva in WGAttrEnum +// because we swap the A/B pointer in _impl code (but not known this info here) +enum class WGAttrCtlEnum +{ + Default_ = 0, + Raw_vvv = 1, // c-vgpr, a-vgpr, b-vgpr + Raw_vaa = 2, // c-vgpr, a-agpr, b-agpr + Raw_vav = 3, // c-vgpr, a-agpr, b-vgpr + Raw_vva = 4, // c-vgpr, a-vgpr, b-agpr + Raw_avv = 5, // c-agpr, a-vgpr, b-vgpr + // raw_a_a_a = 3, // c-agpr, a-agpr, b-agpr +}; + +#define DISPATCH_MFMA_(mfma_, dmod_, amod_, bmod_, cmod_) \ + if constexpr(post_nop_) \ + { \ + asm volatile(mfma_ " %0, %1, %2, %3 ; yyy\n" \ + "s_nop 3" \ + : dmod_(c_vec) \ + : amod_(a_vec), bmod_(b_vec), cmod_(c_vec) \ + :); \ + } \ + else \ + { \ + asm volatile(mfma_ " %0, %1, %2, %3\n" \ + : dmod_(c_vec) \ + : amod_(a_vec), bmod_(b_vec), cmod_(c_vec) \ + :); \ + } + +#define DISPATCH_MFMA_CTRL_(mfma_, ctrl_) \ + if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vvv) \ + { \ + DISPATCH_MFMA_(mfma_, "+v", "v", "v", "v") \ + } \ + else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vaa) \ + { \ + DISPATCH_MFMA_(mfma_, "+v", "a", "a", "v") \ + } \ + else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vav) \ + { \ + DISPATCH_MFMA_(mfma_, "+v", "a", "v", "v") \ + } \ + else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vva) \ + { \ + DISPATCH_MFMA_(mfma_, "+v", "v", "a", "v") \ + } \ + else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_avv) \ + { \ + DISPATCH_MFMA_(mfma_, "+a", "v", "v", "a") \ + } + // FP16 +template struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8 { - using ADataType = fp16_t; - using BDataType = fp16_t; - using CDataType = float; + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = fp16_t; + using BDataType = fp16_t; + using CDataType = float; using AVecType = ext_vector_t; using BVecType = ext_vector_t; @@ -33,16 +89,23 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8 static constexpr index_t kCM1PerLane = 4; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const 
BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { + DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x8f16", Ctrl) + else + { #if defined(__gfx9__) - c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0); + c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0); #else - ignore = c_vec; - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; #endif + } } // c_vec = a_vec * b_vec @@ -52,18 +115,20 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8 return bit_cast( __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0)); #else - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; return CVecType{0.f}; #endif } }; +template struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16 { - using ADataType = fp16_t; - using BDataType = fp16_t; - using CDataType = float; + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = fp16_t; + using BDataType = fp16_t; + using CDataType = float; using AVecType = ext_vector_t; using BVecType = ext_vector_t; @@ -84,16 +149,23 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16 static constexpr index_t kCM1PerLane = 4; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { + DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x16f16", Ctrl) + else + { #if defined(__gfx9__) - c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0); + c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0); #else - ignore = c_vec; - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; #endif + 
} } // c_vec = a_vec * b_vec @@ -103,19 +175,21 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16 return bit_cast( __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0)); #else - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; return CVecType{0.f}; #endif } }; // Bf16 +template struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8 { - using ADataType = bf16_t; - using BDataType = bf16_t; - using CDataType = float; + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = bf16_t; + using BDataType = bf16_t; + using CDataType = float; using AVecType = ext_vector_t; using BVecType = ext_vector_t; @@ -136,28 +210,35 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8 static constexpr index_t kCM1PerLane = 4; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { + DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x8bf16_1k", Ctrl) + else + { #if defined(__gfx90a__) || defined(__gfx94__) - c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0); + c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0); #elif defined(__gfx908__) - static_for<0, 2, 1>{}([&](auto k) { - c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16( - reinterpret_cast&>(a_vec) - .template get_as>()[number{}], - reinterpret_cast&>(b_vec) - .template get_as>()[number{}], - c_vec, - 0, - 0, - 0); - }); + static_for<0, 2, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); #else - ignore = c_vec; - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; #endif + } } 
// c_vec = a_vec * b_vec @@ -181,18 +262,20 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8 }); return c_vec; #else - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; return CVecType{0.f}; #endif } }; +template struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16 { - using ADataType = bf16_t; - using BDataType = bf16_t; - using CDataType = float; + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = bf16_t; + using BDataType = bf16_t; + using CDataType = float; using AVecType = ext_vector_t; using BVecType = ext_vector_t; @@ -213,28 +296,34 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16 static constexpr index_t kCM1PerLane = 4; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { + DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x16bf16_1k", Ctrl) + { #if defined(__gfx90a__) || defined(__gfx94__) - c_vec = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0); + c_vec = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0); #elif defined(__gfx908__) - static_for<0, 2, 1>{}([&](auto k) { - c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16( - reinterpret_cast&>(a_vec) - .template get_as>()[number{}], - reinterpret_cast&>(b_vec) - .template get_as>()[number{}], - c_vec, - 0, - 0, - 0); - }); + static_for<0, 2, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); #else - ignore = c_vec; - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; #endif + } } // c_vec = a_vec * b_vec @@ -258,20 +347,21 @@ struct 
WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16 }); return c_vec; #else - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; return CVecType{0.f}; #endif } }; // FP8 -template +template struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base { - using ADataType = AType_; - using BDataType = BType_; - using CDataType = float; + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = AType_; + using BDataType = BType_; + using CDataType = float; using AVecType = ext_vector_t; using BVecType = ext_vector_t; @@ -292,38 +382,120 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base static constexpr index_t kCM1PerLane = 4; // c_vec += a_vec * b_vec - CK_TILE_DEVICE void - operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const { + if constexpr(Ctrl == WGAttrCtlEnum::Raw_vvv) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "v", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "v", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "v", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "v", "v", "v") + } + } + else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vaa) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "a", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "a", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "a", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) 
+ { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "a", "a", "v") + } + } + else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vav) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "a", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "a", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "a", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "a", "v", "v") + } + } + else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vva) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "v", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "v", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "v", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "v", "a", "v") + } + } + else + { #if defined(__gfx94__) - if constexpr(std::is_same_v && std::is_same_v) - c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); - else if constexpr(std::is_same_v && std::is_same_v) - c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); - else if constexpr(std::is_same_v && std::is_same_v) - c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); - else if constexpr(std::is_same_v && std::is_same_v) - c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( + 
bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); #elif defined(__gfx908__) || defined(__gfx90a__) - static_for<0, 8, 1>{}([&](auto k) { - float a_f32 = - type_convert(reinterpret_cast&>(a_vec) - .template get_as()[number{}]); - float b_f32 = - type_convert(reinterpret_cast&>(b_vec) - .template get_as()[number{}]); + static_for<0, 8, 1>{}([&](auto k) { + float a_f32 = + type_convert(reinterpret_cast&>(a_vec) + .template get_as()[number{}]); + float b_f32 = + type_convert(reinterpret_cast&>(b_vec) + .template get_as()[number{}]); - c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0); - }); + c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0); + }); #else - ignore = c_vec; - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; #endif + } } // c_vec = a_vec * b_vec @@ -356,20 +528,97 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base }); return c_vec; #else - ignore = a_vec; - ignore = b_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; return CVecType{0.f}; #endif } }; +template using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8 = - WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + +template using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 = - WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + +template using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8 = - 
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + +template using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8 = - WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; + +// int8 +template +struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = int8_t; + using BDataType = int8_t; + using CDataType = int32_t; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 32; + static constexpr index_t kN = 32; + static constexpr index_t kK = 16; + + static constexpr index_t kAMLane = 32; + static constexpr index_t kBNLane = 32; + static constexpr index_t kABKLane = 2; + static constexpr index_t kABKPerLane = 8; + + static constexpr index_t kCMLane = 2; + static constexpr index_t kCNLane = 32; + static constexpr index_t kCM0PerLane = 4; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + DISPATCH_MFMA_CTRL_("v_mfma_i32_32x32x16_i8", Ctrl) + else + { +#if defined(__gfx94__) + c_vec = __builtin_amdgcn_mfma_i32_32x32x8i8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); +#elif defined(__gfx908__) || defined(__gfx90a__) + static_for<0, 8, 1>{}([&](auto k) { + float a_f32 = + type_convert(reinterpret_cast&>(a_vec) + .template get_as()[number{}]); + float b_f32 = + type_convert(reinterpret_cast&>(b_vec) + .template get_as()[number{}]); + + c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0); + }); +#else + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; +#endif + } + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { + CVecType c_vec{0}; + 
operator()(c_vec, a_vec, b_vec); + return c_vec; + } +}; + +#undef DISPATCH_MFMA_ } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp index 4183d9cb95..99cd5d787e 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -21,40 +21,40 @@ struct WarpGemmMfmaDispatcher; // clang-format off // fp16 -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = 
WarpGemmMfmaF16F16F32M16N16K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; }; // bf16 -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = 
WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; }; // fp8 -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; }; -template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; }; 
+template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; }; // clang-format on } // namespace impl diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp index eb9dbf127d..182d023a00 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp @@ -31,11 +31,21 @@ struct WarpGemmImpl using BWarpTensor = static_distributed_tensor; using CWarpTensor = static_distributed_tensor; - CK_TILE_DEVICE void operator()(CWarpTensor& c, const AWarpTensor& a, const BWarpTensor& b) const + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { - using AVec = ext_vector_t; - using BVec = ext_vector_t; - using CVec = ext_vector_t; + return WarpGemmAttribute_::get_num_of_access(); + } + + template + CK_TILE_DEVICE void + operator()(CTensor& c, const ATensor& a, const BTensor& b, bool_constant = {}) const + { + static_assert(detail::is_similiar_distributed_tensor_v && + detail::is_similiar_distributed_tensor_v && + detail::is_similiar_distributed_tensor_v); + using AVec = ext_vector_t; + using BVec = ext_vector_t; + using CVec = ext_vector_t; constexpr auto I0 = number<0>{}; @@ -44,18 +54,49 @@ struct WarpGemmImpl auto c_vec = c.get_thread_buffer().template get_as()[I0]; // c_vec += a_vec * b_vec - WarpGemmAttribute{}(c_vec, a_vec, b_vec); + WarpGemmAttribute{}(c_vec, a_vec, b_vec, bool_constant{}); c.get_thread_buffer().template set_as(I0, c_vec); } - CK_TILE_DEVICE auto operator()(const AWarpTensor& a, const 
BWarpTensor& b) const + template + CK_TILE_DEVICE void operator()(CTensor& c, + const ATensor& a, + const BTensor& b, + number, + bool_constant = {}) const { - CWarpTensor c; + using AVec = ext_vector_t; + using BVec = ext_vector_t; + using CVec = ext_vector_t; - using AVec = ext_vector_t; - using BVec = ext_vector_t; - using CVec = ext_vector_t; + constexpr auto I0 = number<0>{}; + + const auto a_vec = a.get_thread_buffer().template get_as()[I0]; + const auto b_vec = b.get_thread_buffer().template get_as()[I0]; + auto c_vec = c.get_thread_buffer().template get_as()[I0]; + + // c_vec += a_vec * b_vec + WarpGemmAttribute{}(c_vec, a_vec, b_vec, number{}, bool_constant{}); + + c.get_thread_buffer().template set_as(I0, c_vec); + } + + template + CK_TILE_DEVICE auto operator()(const ATensor& a, const BTensor& b) const + { + using CTensor = CWarpTensor; + static_assert(detail::is_similiar_distributed_tensor_v && + detail::is_similiar_distributed_tensor_v); + CTensor c; + + using AVec = ext_vector_t; + using BVec = ext_vector_t; + using CVec = ext_vector_t; constexpr auto I0 = number<0>{}; diff --git a/include/ck_tile/ops/moe_sorting.hpp b/include/ck_tile/ops/moe_sorting.hpp deleted file mode 100644 index b74607f061..0000000000 --- a/include/ck_tile/ops/moe_sorting.hpp +++ /dev/null @@ -1,11 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp" -#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp" -#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp" -#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp" -#include "ck_tile/ops/common/generic_2d_block_shape.hpp" -#include "ck_tile/ops/common/tensor_layout.hpp" From b6bcd76d881421af2f04246b1e4bbac45b7ce3b9 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Tue, 26 Nov 2024 08:45:14 +0100 Subject: [PATCH 15/52] CK-Tile first draft of universal block gemm with interwave & intrawave scheduler (#1676) * Block universal gemm. * Universal block gemm with interwave scheduler - draft. * Refactoring * Move a/b_warp_tiles into BlockGemmImpl * set BlockGemmImpl as a class member * Change tile size for more suitable to memory bound cases. * Introduce kKPerThread to WarpGemm * Add documentation comment. * Fix Interwave scheduler block gemm. * Add compute/memory friendly tile configuration. * Clean * New tile configurations in gemm mem example. * Add more static checks and fix loop order in block gemm. * Add more static checks and use warp gemm mfma dispatcher. * Add default scheduler block gemm. * Remove logging in example. 
--- example/01_gemm/run_gemm_example_v2.inc | 2 +- example/ck_tile/03_gemm/gemm_mem_pipeline.cpp | 33 +- example/ck_tile/03_gemm/run_gemm_example.inc | 22 +- include/ck_tile/ops/gemm.hpp | 1 + .../block/block_universal_gemm_as_bs_cr.hpp | 661 ++++++++++++++++++ .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 12 +- .../gemm_pipeline_ag_bg_cr_scheduler.hpp | 2 + ...ine_agmem_bgmem_creg_v1_default_policy.hpp | 38 +- .../gemm/pipeline/gemm_pipeline_problem.hpp | 2 + .../gemm/warp/warp_gemm_attribute_mfma.hpp | 55 +- .../ck_tile/ops/gemm/warp/warp_gemm_impl.hpp | 7 +- 11 files changed, 779 insertions(+), 56 deletions(-) create mode 100644 include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc index 71524fdecf..5b6969f1d9 100644 --- a/example/01_gemm/run_gemm_example_v2.inc +++ b/example/01_gemm/run_gemm_example_v2.inc @@ -261,7 +261,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) if(config.time_kernel) { ave_time = - invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 5, 10, true, 4}); + invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 50, 100, true, 4}); std::size_t flop = 2_uz * M * N * K; std::size_t num_btype = diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp index ff9d8bad32..97d150412d 100644 --- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp +++ b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp @@ -17,9 +17,24 @@ template float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) { - // ToDo: This will be modified by the codegen code later. 
+#if 1 + // Memory friendly for Interwave scheduler constexpr ck_tile::index_t M_Tile = 128; - constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t N_Tile = 32; + constexpr ck_tile::index_t K_Tile = 64; + + constexpr ck_tile::index_t M_Warp = 4; + constexpr ck_tile::index_t N_Warp = 1; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; + +#else + // Compute friendly for Intrawave scheduler + constexpr ck_tile::index_t M_Tile = 256; + constexpr ck_tile::index_t N_Tile = 256; constexpr ck_tile::index_t K_Tile = 32; constexpr ck_tile::index_t M_Warp = 2; @@ -28,12 +43,12 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) constexpr ck_tile::index_t M_Warp_Tile = 32; constexpr ck_tile::index_t N_Warp_Tile = 32; - constexpr ck_tile::index_t K_Warp_Tile = 8; + constexpr ck_tile::index_t K_Warp_Tile = 16; +#endif - // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part. - constexpr bool kPadM = true; - constexpr bool kPadN = true; - constexpr bool kPadK = true; + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; constexpr int kBlockPerCu = 1; @@ -174,8 +189,8 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) { std::ostringstream err; err << "When there's no hot loop, this tail number \"" << tail_num - << "\" is not supported! " << __FILE__ << ":" << __LINE__ - << ", in function: " << __func__; + << "\" is not supported! 
PrefetchStages: " << BaseGemmPipeline::PrefetchStages + << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__; throw std::runtime_error(err.str()); } } diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index 8db131738b..5199c1e3ef 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -31,15 +31,13 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, float ave_time = gemm_calc( args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); - std::string op_name{"Gemm{MemBoundPipeline}"}; - std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N; float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_byte / 1.E6 / ave_time; - std::cout << "Run " << op_name << "kernel with M =" << M << " N =" << N << " K =" << K + std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; @@ -114,7 +112,6 @@ int run_gemm_example_with_layouts(int argc, f_host_tensor_descriptor(M, N, stride_C, CLayout{})); // TODO: add different init types - ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); @@ -202,14 +199,15 @@ int run_gemm_example(int argc, char* argv[]) { return run_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); } - else if(a_layout == "C" && b_layout == "C") - { - return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{}); - } - else if(a_layout == "C" && b_layout == "R") - { - return run_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{}); - } + // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not + // 
work. else if(a_layout == "C" && b_layout == "C") + // { + // return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{}); + // } + // else if(a_layout == "C" && b_layout == "R") + // { + // return run_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{}); + // } else { throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index ac74782a3a..9a033ee2de 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -22,6 +22,7 @@ #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp" #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp" #include "ck_tile/ops/gemm/block/block_gemm_problem.hpp" +#include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp" diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp new file mode 100644 index 0000000000..5f98a7a0ba --- /dev/null +++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" + +namespace ck_tile { + +// A is block window on shared memory +// B is block window on shared memory +// C is block distributed tensor +template +struct BlockUniversalGemmAsBsCr +{ + private: + // TODO: This should be in Policy - UniversalGemmPolicyBase ? 
+ template + struct GemmTraits_ + { + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; + + static constexpr index_t kBlockSize = Problem::kBlockSize; + static constexpr auto Scheduler = Problem::Scheduler; + + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; + + static constexpr auto config = Policy::template GetWarpGemmMWarpNWarp(); + + using WarpGemm = remove_cvref_t())>; + + static constexpr index_t MWarp = config.template at<1>(); + static constexpr index_t NWarp = config.template at<2>(); + + static_assert(MWarp == BlockGemmShape::BlockWarps::at(number<0>{}), + "Error! WarpGemm's MWarp is not consisten with BlockGemmShape!"); + static_assert(NWarp == BlockGemmShape::BlockWarps::at(number<1>{}), + "Error! WarpGemm's NWarp is not consisten with BlockGemmShape!"); + static_assert(WarpGemm::kM == BlockGemmShape::WarpTile::at(number<0>{}), + "Error! WarpGemm's M is not consisten with BlockGemmShape!"); + static_assert(WarpGemm::kN == BlockGemmShape::WarpTile::at(number<1>{}), + "Error! WarpGemm's N is not consisten with BlockGemmShape!"); + + static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM); + static constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WarpGemm::kN); + static constexpr index_t KIterPerWarp = KPerBlock / WarpGemm::kK; + + static_assert(MIterPerWarp * MWarp * WarpGemm::kM == MPerBlock, + "Error! Warps should cover all Block tile!"); + static_assert(NIterPerWarp * NWarp * WarpGemm::kN == NPerBlock, + "Error! 
Warps should cover all Block tile!"); + + static constexpr index_t MPerBlockPerIter = MWarp * WarpGemm::kM; + static constexpr index_t NPerBlockPerIter = NWarp * WarpGemm::kN; + static constexpr index_t KPerBlockPerIter = WarpGemm::kK; + + using AWarpTileDistr = remove_cvref_t; + using BWarpTileDistr = remove_cvref_t; + + using AWarpTile = + remove_cvref_t(AWarpTileDistr{}))>; + using BWarpTile = + remove_cvref_t(BWarpTileDistr{}))>; + + // TODO: Should we have two policies? Interwave & Intrawave ?? + static constexpr index_t InterWaveSchedulingMacClusters = 1; + + static constexpr index_t KPack = WarpGemm::kKPerThread; + static constexpr index_t KPerThread = KPerBlock / WarpGemm::kK * KPack; + static constexpr index_t KRepeat = KPerThread / KPack; + }; + + public: + using Traits = GemmTraits_; + + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + + using WarpGemm = remove_cvref_t; + + static constexpr index_t KIterPerWarp = Traits::KIterPerWarp; + static constexpr index_t MIterPerWarp = Traits::MIterPerWarp; + static constexpr index_t NIterPerWarp = Traits::NIterPerWarp; + + static constexpr index_t MWarp = Traits::MWarp; + static constexpr index_t NWarp = Traits::NWarp; + + static constexpr auto Scheduler = Traits::Scheduler; + + private: + template + struct BlockGemmImpl + { + }; + + template + struct BlockGemmImpl + { + // C += A * B + template + CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor, + const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + static_assert( + std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); + static_assert(std::is_same_v && + std::is_same_v, + "The ADataType and BDataType as defined in " + "traits should be the same as correspoinding block window data type!"); + + static_assert( + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && 
+ GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + "MPerBlock, NPerBlock, KPerBlock defined in " + " BlockGemmShape are different from A/B block smem windows apropriate dims!"); + + const index_t iMWarp = get_warp_id() / GemmTraits::NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + + // TODO: refactor warp_window tile type to class member as it should be + // compile-time known information. + auto a_warp_window_tmp = make_tile_window( + a_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_block_window.get_window_origin() + + multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + + using AWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::AWarpTile::get_num_of_dimension() == + AWarpWindow::get_num_of_dimension(), + "AWarpWindow number of dimensions must be equal to " + "AWarpTile number of dimensions!"); + static_assert(GemmTraits::AWarpTile::get_lengths() == + AWarpWindow{}.get_window_lengths(), + "AWarpWindow lengths must be equal to AWarpTile lengths!"); + + statically_indexed_array< + statically_indexed_array, + GemmTraits::MIterPerWarp> + a_warp_windows; + + // construct B-warp-window + auto b_warp_window_tmp = make_tile_window( + b_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_block_window.get_window_origin() + + multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + + using BWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::BWarpTile::get_num_of_dimension() == + BWarpWindow::get_num_of_dimension(), + "BWarpWindow number of dimensions must be equal to " + "BWarpTile number of dimensions!"); + static_assert(GemmTraits::BWarpTile::get_lengths() == + 
BWarpWindow{}.get_window_lengths(), + "BWarpWindow lengths must be equal to BWarpTile lengths!"); + + statically_indexed_array< + statically_indexed_array, + GemmTraits::NIterPerWarp> + b_warp_windows; + + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + a_warp_windows(mIter)(kIter) = a_warp_window_tmp; + + // TODO: I don't have to move 0,0 window! + move_tile_window(a_warp_windows(mIter)(kIter), + {mIter * GemmTraits::MPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + b_warp_windows(nIter)(kIter) = b_warp_window_tmp; + + move_tile_window(b_warp_windows(nIter)(kIter), + {nIter * GemmTraits::NPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + using CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; + using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + + constexpr auto c_warp_y_lengths = + to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + // hot loop: + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + const auto a_warp_tile = load_tile(a_warp_windows(mIter)(kIter)); + + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + const auto b_warp_tile = load_tile(b_warp_windows(nIter)(kIter)); + + // read C warp tensor from C block tensor- + CWarpTensor c_warp_tensor; + + c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); + + // warp GEMM + typename GemmTraits::WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile); + + // write C warp tensor into C block tensor + 
c_block_tensor.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + c_warp_tensor.get_thread_buffer()); + }); + }); + }); + } + }; + + template + struct BlockGemmImpl + { + statically_indexed_array< + statically_indexed_array, + GemmTraits::MIterPerWarp> + a_warp_tiles_; + + statically_indexed_array< + statically_indexed_array, + GemmTraits::NIterPerWarp> + b_warp_tiles_; + + template + CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + static_assert( + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && + GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + "MPerBlock, NPerBlock, KPerBlock defined in " + " BlockGemmShape are different from A/B block smem windows apropriate dims!"); + + static_assert(std::is_same_v && + std::is_same_v, + "The ADataType and BDataType as defined in " + "traits should be the same as correspoinding block window data type!"); + + const index_t iMWarp = get_warp_id() / GemmTraits::NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + + // TODO: refactor warp_window tile type to class member as it should be + // compile-time known information. 
+ auto a_warp_window_tmp = make_tile_window( + a_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_block_window.get_window_origin() + + multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + + using AWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::AWarpTile::get_num_of_dimension() == + AWarpWindow::get_num_of_dimension(), + "AWarpWindow number of dimensions must be equal to " + "AWarpTile number of dimensions!"); + static_assert(GemmTraits::AWarpTile::get_lengths() == + AWarpWindow{}.get_window_lengths(), + "AWarpWindow lengths must be equal to AWarpTile lengths!"); + + statically_indexed_array< + statically_indexed_array, + GemmTraits::MIterPerWarp> + a_warp_windows; + + // construct B-warp-window + auto b_warp_window_tmp = make_tile_window( + b_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_block_window.get_window_origin() + + multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + + using BWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::BWarpTile::get_num_of_dimension() == + BWarpWindow::get_num_of_dimension(), + "BWarpWindow number of dimensions must be equal to " + "BWarpTile number of dimensions!"); + static_assert(GemmTraits::BWarpTile::get_lengths() == + BWarpWindow{}.get_window_lengths(), + "BWarpWindow lengths must be equal to BWarpTile lengths!"); + + statically_indexed_array< + statically_indexed_array, + GemmTraits::NIterPerWarp> + b_warp_windows; + + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + a_warp_windows(mIter)(kIter) = a_warp_window_tmp; + + // TODO: I don't have to move 0,0 window! 
+ move_tile_window(a_warp_windows(mIter)(kIter), + {mIter * GemmTraits::MPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + b_warp_windows(nIter)(kIter) = b_warp_window_tmp; + + move_tile_window(b_warp_windows(nIter)(kIter), + {nIter * GemmTraits::NPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + // read A warp tensor from A block window + load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter)); + }); + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + // read B warp tensor from B Block window + load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter)); + }); + }); + } + + // C += A * B + template + CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor, + [[maybe_unused]] const ASmemBlockWindow& a_block_window, + [[maybe_unused]] const BSmemBlockWindow& b_block_window) + { + static_assert( + std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); + + using CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; + using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + + constexpr auto c_warp_y_lengths = + to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + // hot loop: + static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + // read C warp tensor from C block tensor- + CWarpTensor c_warp_tensor; + + c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data( + merge_sequences(sequence{}, 
c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); + + // warp GEMM + typename GemmTraits::WarpGemm{}(c_warp_tensor, + a_warp_tiles_[mIter][kIter], + b_warp_tiles_[nIter][kIter]); + + // write C warp tensor into C block tensor + c_block_tensor.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + c_warp_tensor.get_thread_buffer()); + }); + }); + }); + } + }; + + template + struct BlockGemmImpl + { + static constexpr index_t KPerThread = GemmTraits::KPerThread; + static constexpr index_t NumMacClusters = GemmTraits::InterWaveSchedulingMacClusters; + static constexpr index_t KPerInnerLoop = + ck_tile::max(KPerThread / NumMacClusters, GemmTraits::KPack); + // TODO: do we really need this?? Are there any cases when this would be >=1 ?? + // Would we need InterWaveSchedulingMacClusters > 1 ??? + static constexpr index_t KRepeat = KPerThread / KPerInnerLoop; + static constexpr index_t KInnerLoopIter = KPerInnerLoop / GemmTraits::KPack; + + statically_indexed_array< + statically_indexed_array, + GemmTraits::MIterPerWarp> + a_warp_tiles_; + + statically_indexed_array< + statically_indexed_array, + GemmTraits::NIterPerWarp> + b_warp_tiles_; + + template + CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + static_assert( + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && + GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + "MPerBlock, NPerBlock, KPerBlock defined in " + " BlockGemmShape are different from A/B block smem windows apropriate dims!"); + + static_assert(std::is_same_v && + std::is_same_v, + "The ADataType and BDataType as defined in " + "traits should be the same as correspoinding block window data type!"); + + const index_t iMWarp = 
get_warp_id() / GemmTraits::NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + + // TODO: refactor warp_window tile type to class member as it should be + // compile-time known information. + auto a_warp_window_tmp = make_tile_window( + a_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_block_window.get_window_origin() + + multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, KIdx * KPerInnerLoop}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + + using AWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::AWarpTile::get_num_of_dimension() == + AWarpWindow::get_num_of_dimension(), + "AWarpWindow number of dimensions must be equal to " + "AWarpTile number of dimensions!"); + static_assert(GemmTraits::AWarpTile::get_lengths() == + AWarpWindow{}.get_window_lengths(), + "AWarpWindow lengths must be equal to AWarpTile lengths!"); + + statically_indexed_array, + GemmTraits::MIterPerWarp> + a_warp_windows; + + // construct B-warp-window + auto b_warp_window_tmp = make_tile_window( + b_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_block_window.get_window_origin() + + multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, KIdx * KPerInnerLoop}, + make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + + using BWarpWindow = remove_cvref_t; + + static_assert(GemmTraits::BWarpTile::get_num_of_dimension() == + BWarpWindow::get_num_of_dimension(), + "BWarpWindow number of dimensions must be equal to " + "BWarpTile number of dimensions!"); + static_assert(GemmTraits::BWarpTile::get_lengths() == + BWarpWindow{}.get_window_lengths(), + "BWarpWindow lengths must be equal to BWarpTile lengths!"); + + statically_indexed_array, + GemmTraits::NIterPerWarp> + b_warp_windows; + + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { + a_warp_windows(mIter)(kIter) = 
a_warp_window_tmp; + + move_tile_window(a_warp_windows(mIter)(kIter), + {mIter * GemmTraits::MPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { + b_warp_windows(nIter)(kIter) = b_warp_window_tmp; + + move_tile_window(b_warp_windows(nIter)(kIter), + {nIter * GemmTraits::NPerBlockPerIter, + kIter * GemmTraits::KPerBlockPerIter}); + }); + }); + + // TODO check if a_warp_tiles has same desc as a_warp_window + static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + // read A warp tensor from A block window + load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter)); + }); + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + // read B warp tensor from B Block window + load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter)); + }); + }); + } + + // C += A * B + template + CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor, + const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + static_assert( + std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); + + using CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; + using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + + constexpr auto c_warp_y_lengths = + to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + // hot loop: + static_for<0, KRepeat, 1>{}([&](auto kIter) { + LocalPrefetch(a_block_window, b_block_window); + __builtin_amdgcn_sched_barrier(0); + // NOTE: Synchronize threads in a workgroup at the start of each MAC + // cluster, but except the first, as we can shorten non-MAC cluster a bit + // and there's no observable negative impact. 
The desired effect is waves in + // a workgroup executing MAC in sync. This avoids some out-of-sync waves + // hijacking MAC resource from other workgroups and reducing the chance of + // latency hiding by waiting for the rest of the workgroup at the eventual + // sync point. + if constexpr(kIter.value != 0 || KRepeat == 1) + { + __builtin_amdgcn_s_barrier(); + __builtin_amdgcn_sched_barrier(0); + } + + static_for<0, KInnerLoopIter, 1>{}([&](auto kInnerIter) { + static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + // read C warp tensor from C block tensor- + CWarpTensor c_warp_tensor; + + c_warp_tensor.get_thread_buffer() = + c_block_tensor.get_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); + + // The block_sync_lds() here performs double duty: + // A) safeguard against data hazard because barrier from + // blockwise_gemm is moved here B) reduce VMEM FIFO congestion + // by applying small delays to different wavefronts It is + // performed near the end of MAC cluster to minimize lgkmcnt + // penalty + if constexpr(kIter.value == KRepeat - 1 && + kInnerIter.value == KInnerLoopIter - 1 && + mIter.value == GemmTraits::MIterPerWarp - 1 && + nIter.value == GemmTraits::NIterPerWarp - 1) + { + __builtin_amdgcn_sched_barrier(0); + block_sync_lds(); + __builtin_amdgcn_sched_barrier(0); + } + // warp GEMM + typename GemmTraits::WarpGemm{}(c_warp_tensor, + a_warp_tiles_[mIter][kInnerIter], + b_warp_tiles_[nIter][kInnerIter]); + + // write C warp tensor into C block tensor + c_block_tensor.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + c_warp_tensor.get_thread_buffer()); + + if constexpr(kInnerIter.value == 0 && mIter.value == 0 && + nIter.value == 0) + { + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(1); + 
__builtin_amdgcn_sched_barrier(0); + } + }); + }); + }); + + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(0); + __builtin_amdgcn_sched_barrier(0); + }); + } + }; + + public: + CK_TILE_DEVICE static constexpr auto MakeCBlockTile() + { + constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding< + sequence<>, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{}); + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); + auto c_block_tensor = make_static_distributed_tensor(c_block_dstr); + + return c_block_tensor; + } + + template + CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + block_gemm_impl_.template LocalPrefetch(a_block_window, b_block_window); + } + + // C += A * B + template + CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor, + const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window); + } + + // C = A * B + template + CK_TILE_DEVICE auto operator()(const ASmemBlockWindow& a_block_window, + const BSmemBlockWindow& b_block_window) + { + auto c_block_tensor = MakeCBlockTile(); + block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window); + return c_block_tensor; + } + + private: + BlockGemmImpl block_gemm_impl_{}; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index 85c5c58056..4634e9dcb9 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -247,8 +247,8 @@ struct 
GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem b_lds_block, make_tuple(number{}, number{}), {0, 0}); // Block GEMM - constexpr auto block_gemm = BlockGemm(); - auto c_block_tile = block_gemm.MakeCBlockTile(); + auto block_gemm = BlockGemm(); + auto c_block_tile = block_gemm.MakeCBlockTile(); using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution()); using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution()); @@ -290,7 +290,7 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem { static_for<0, PrefetchStages, 1>{}([&](auto prefetch_idx) { block_sync_lds(); - // block_gemm.LocalPrefetch(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); block_sync_lds(); @@ -318,7 +318,7 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem static_for<1, tail_num, 1>{}([&](auto prefetch_idx) { block_sync_lds(); - // block_gemm.LocalPrefetch(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); block_sync_lds(); @@ -331,14 +331,14 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem }); block_sync_lds(); - // block_gemm.LocalPrefetch(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); }; if constexpr(TailNum == TailNumber::One) { block_sync_lds(); - // block_gemm.LocalPrefetch(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); } else if constexpr(TailNum == TailNumber::Two) diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp index 5e93ca21c0..6f51e6b8a9 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp +++ 
b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp @@ -11,6 +11,7 @@ namespace ck_tile { enum struct GemmPipelineScheduler { + Default, Intrawave, Interwave, }; @@ -43,6 +44,7 @@ inline std::ostream& operator<<(std::ostream& os, const ck_tile::GemmPipelineSch { switch(s) { + case ck_tile::GemmPipelineScheduler::Default: os << "Default"; break; case ck_tile::GemmPipelineScheduler::Intrawave: os << "Intrawave"; break; case ck_tile::GemmPipelineScheduler::Interwave: os << "Interwave"; break; default: os << ""; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp index c765b3ce9d..b475ebb7bd 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp @@ -4,6 +4,7 @@ #pragma once #include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp" namespace ck_tile { @@ -52,6 +53,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; + // TODO: this 8 is AK1! should be a policy parameter! constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( make_tuple(number{}, number{}, number<8>{}), make_tuple(number<(kMPerBlock + 1) * 8>{}, number<8>{}, number<1>{}), @@ -264,6 +266,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error."); static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error."); constexpr index_t M0 = MPerBlock / (M2 * M1); + static_assert(M0 * M1 * M2 == MPerBlock, + "Incorrect M0, M2, M1 configuration! 
" + "M0, M1, M2 must cover whole MPerBlock!"); return make_static_tile_distribution( tile_distribution_encoding, @@ -277,6 +282,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy { constexpr index_t M0 = BlockSize / get_warp_size(); constexpr index_t M1 = MPerBlock / (M2 * M0); + static_assert(M0 * M1 * M2 == MPerBlock, + "Incorrect M0, M1, M2 configuration! " + "M0, M1, M2 must cover whole MPerBlock!"); return make_static_tile_distribution( tile_distribution_encoding, tuple, sequence>, @@ -350,6 +358,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error."); static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error."); constexpr index_t N0 = NPerBlock / (N2 * N1); + static_assert(N0 * N1 * N2 == NPerBlock, + "Incorrect N0, N1, N2 configuration! " + "N0, N1, N2 must cover whole NPerBlock!"); return make_static_tile_distribution( tile_distribution_encoding, @@ -364,7 +375,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy { constexpr index_t N0 = BlockSize / get_warp_size(); constexpr index_t N1 = NPerBlock / (N2 * N0); - + static_assert(N0 * N1 * N2 == NPerBlock, + "Incorrect N0, N1, N2 configuration! 
" + "N0, N1, N2 must cover whole NPerBlock!"); return make_static_tile_distribution( tile_distribution_encoding, tuple, sequence>, @@ -475,9 +488,28 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy template CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm() { - using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1DefaultPolicy; + constexpr bool TransposeC = false; + constexpr auto I0 = number<0>{}; + constexpr auto I1 = number<1>{}; + constexpr auto I2 = number<2>{}; - return BlockGemmASmemBSmemCRegV1{}; + using AccDataType = float; + using BlockWarps = typename Problem::BlockGemmShape::BlockWarps; + using WarpTile = typename Problem::BlockGemmShape::WarpTile; + using WarpGemm = WarpGemmMfmaDispatcher; + using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy; + + return BlockUniversalGemmAsBsCr{}; } }; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp index 3c43790bd6..bf51577aeb 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp @@ -33,6 +33,8 @@ struct GemmPipelineProblemBase static constexpr bool kPadN = GemmTraits::kPadN; static constexpr bool kPadK = GemmTraits::kPadK; + static constexpr auto Scheduler = GemmPipelineScheduler::Default; + CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentA() { if constexpr(std::is_same_v) diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp index 0a8d2dfbe3..a9e466a796 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -21,9 +21,10 @@ struct WarpGemmAtrributeMfma using BVecType = typename Impl::BVecType; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kM; - static constexpr index_t kN = Impl::kN; - static constexpr index_t kK = Impl::kK; + static constexpr index_t kM = Impl::kM; + static constexpr index_t kN = Impl::kN; + static constexpr index_t kK = Impl::kK; + static constexpr index_t kKPerThread = Impl::kABKPerLane; CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } @@ -86,9 +87,10 @@ struct WarpGemmAtrributeMfmaIterateK ext_vector_t::vector_size * kKIter>; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kM; - static constexpr index_t kN = Impl::kN; - static constexpr index_t kK = Impl::kK * kKIter; + static constexpr index_t kM = Impl::kM; + static constexpr index_t kN = Impl::kN; + static constexpr index_t kK = Impl::kK * kKIter; + static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter; CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } @@ -197,9 +199,10 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution using BVecType = typename Impl::AVecType; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kN; - static constexpr index_t kN = Impl::kM; - static constexpr index_t kK = Impl::kK; + static constexpr index_t kM = Impl::kN; + static constexpr index_t kN = Impl::kM; + static constexpr index_t kK = Impl::kK; + static constexpr index_t kKPerThread = Impl::kABKPerLane; CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } @@ -260,9 +263,10 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB using BVecType = typename Impl::AVecType; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kN; - static constexpr index_t kN = Impl::kM; - static constexpr index_t kK = Impl::kK; + static constexpr index_t kM = Impl::kN; + static constexpr index_t kN = 
Impl::kM; + static constexpr index_t kK = Impl::kK; + static constexpr index_t kKPerThread = Impl::kABKPerLane; CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } @@ -330,9 +334,10 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution ext_vector_t::vector_size * kKIter>; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kN; - static constexpr index_t kN = Impl::kM; - static constexpr index_t kK = Impl::kK * kKIter; + static constexpr index_t kM = Impl::kN; + static constexpr index_t kN = Impl::kM; + static constexpr index_t kK = Impl::kK * kKIter; + static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter; CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } @@ -444,10 +449,11 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB ext_vector_t::vector_size * kKIter>; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kN; - static constexpr index_t kN = Impl::kM; - static constexpr index_t kK = Impl::kK * kKIter; - static constexpr index_t SFactor = SFactor_; // group how many CM1 together + static constexpr index_t kM = Impl::kN; + static constexpr index_t kN = Impl::kM; + static constexpr index_t kK = Impl::kK * kKIter; + static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter; + static constexpr index_t SFactor = SFactor_; // group how many CM1 together CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } @@ -583,10 +589,11 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA ext_vector_t::vector_size * kKIter>; using CVecType = typename Impl::CVecType; - static constexpr index_t kM = Impl::kM; - static constexpr index_t kN = Impl::kN; - static constexpr index_t kK = Impl::kK * kKIter; - static constexpr index_t SFactor = SFactor_; // group how many CM1 together + static constexpr index_t kM = Impl::kM; + static constexpr index_t kN = Impl::kN; + static constexpr index_t kK = 
Impl::kK * kKIter; + static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter; + static constexpr index_t SFactor = SFactor_; // group how many CM1 together CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; } diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp index 182d023a00..f9d50ed35e 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -14,6 +14,11 @@ struct WarpGemmImpl static constexpr index_t kM = WarpGemmAttribute::kM; static constexpr index_t kN = WarpGemmAttribute::kN; static constexpr index_t kK = WarpGemmAttribute::kK; + /// @brief The number of elements in K dimension processed by single thread in wavefront. + /// + /// @note Note that WarpGemm may run MFMA instruction multiple times (on different K). + /// In such situation this value reflects this fact. 
+ static constexpr index_t kKPerThread = WarpGemmAttribute::kKPerThread; using ADataType = typename WarpGemmAttribute::ADataType; using BDataType = typename WarpGemmAttribute::BDataType; From cf2d635ea27c074e7025896514c4b94034d370cc Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Tue, 26 Nov 2024 20:37:54 +0800 Subject: [PATCH 16/52] [CK_TILE] Fix incorrect computation of group mode PagedAttention (#1688) * Allow getting batch size from splitkv tile partitioner * Fix wrong paged-kvcache impl for group mode * Fix wrong example code for page-kvcache * Undo changes in fmha_fwd.cpp * Always use 2D block table * Add is_gappy kernel argument for paged-kvcache The is_gappy argument is used for differentiating seqstart_k_ptr usage in flash-attention & xformers * Remove out-of-date comments * Remove no-longer used method * Fix wrong # page-block calculation * Fix wrong comment --------- Co-authored-by: Qianfeng --- example/ck_tile/01_fmha/fmha_fwd.cpp | 1 + example/ck_tile/01_fmha/fmha_fwd.hpp | 12 +++ .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 56 +++++++----- .../fmha_fwd_splitkv_tile_partitioner.hpp | 10 +-- ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp | 90 +++++++++++-------- 5 files changed, 105 insertions(+), 64 deletions(-) diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index 00e0a16536..1f0d73d950 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -1046,6 +1046,7 @@ bool run(const ck_tile::ArgParser& arg_parser) (0 < page_block_size ? block_table_buf.GetDeviceBuffer() : nullptr); args.batch_stride_block_table = batch_stride_block_table; args.page_block_size = page_block_size; + args.is_gappy = false; // use 'false' for flash-attention integration args.cache_batch_idx = (use_cache_batch_idx ? 
cache_batch_idx_buf.GetDeviceBuffer() : nullptr); diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 704453baa4..8a821b9177 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -165,6 +165,8 @@ struct fmha_fwd_splitkv_args void* block_table_ptr; ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr ck_tile::index_t page_block_size; // only used if 'block_table_ptr' is not nullptr + bool is_gappy; // differentiate seqstart_k_ptr usage. only used if 'block_table_ptr' is not + // nullptr. const void* cache_batch_idx; @@ -173,12 +175,21 @@ struct fmha_fwd_splitkv_args // seqlen_k = kargs.seqlen_k // group mode: seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b] // seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b] + // or kargs.seqlen_k_ptr[b] + // // batch mode (kvcache): // seqlen_q = kargs.seqlen_q // seqlen_k = kargs.seqlen_k_ptr[b] // group mode (kvcache): // seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b] + // + // when is_gappy=true: + // seqlen_k = kargs.seqlen_k_ptr[b] + // seqstart_k_ptr[b] now store local offset of each batch + // + // when is_gappy=false: // seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b] + // or kargs.seqlen_k_ptr[b] const void* seqstart_q_ptr; const void* seqstart_k_ptr; const void* seqlen_k_ptr; @@ -395,6 +406,7 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args) args.block_table_ptr, args.batch_stride_block_table, args.page_block_size, + args.is_gappy, args.scale_s, args.scale_p, args.stride_q, diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index 3c4e02d08b..dcb671d81e 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -172,13 +172,18 @@ struct 
FmhaFwdSplitKVKernel float scale_p; }; - struct PageBlockTableKargs + struct CommonPageBlockTableKargs { const int32_t* block_table_ptr; ck_tile::index_t batch_stride_block_table; ck_tile::index_t page_block_size; }; + struct GroupModePageBlockTableKargs : CommonPageBlockTableKargs + { + bool is_gappy = false; + }; + struct CacheBatchIdxKargs { const int32_t* cache_batch_idx; @@ -193,7 +198,7 @@ struct FmhaFwdSplitKVKernel EmptyKargs<0>>>, std::conditional_t>, std::conditional_t>, - std::conditional_t + std::conditional_t { const int32_t* seqlen_k_ptr; @@ -215,7 +220,7 @@ struct FmhaFwdSplitKVKernel EmptyKargs<0>>>, std::conditional_t>, std::conditional_t>, - std::conditional_t> + std::conditional_t> { const int32_t* seqstart_q_ptr; const int32_t* seqstart_k_ptr; @@ -375,6 +380,7 @@ struct FmhaFwdSplitKVKernel const void* block_table_ptr, ck_tile::index_t batch_stride_block_table, ck_tile::index_t page_block_size, + bool is_gappy, float scale_s, float scale_p, ck_tile::index_t stride_q, @@ -461,6 +467,7 @@ struct FmhaFwdSplitKVKernel kargs.block_table_ptr = reinterpret_cast(block_table_ptr); kargs.batch_stride_block_table = batch_stride_block_table; kargs.page_block_size = page_block_size; + kargs.is_gappy = is_gappy; } return kargs; @@ -495,11 +502,13 @@ struct FmhaFwdSplitKVKernel const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; - long_index_t batch_offset_k = 0; - long_index_t batch_offset_v = 0; + long_index_t batch_offset_k = 0; // unused for paged-kvcache + long_index_t batch_offset_v = 0; // unused for paged-kvcache long_index_t batch_offset_bias = 0; long_index_t batch_offset_lse_acc = 0; long_index_t batch_offset_o_acc = 0; + index_t kv_l2p_offset = + 0; // logical-to-physical offset of seqlen_k coordinate. 
only used for paged-kvcache if constexpr(kIsGroupMode) { @@ -508,22 +517,14 @@ struct FmhaFwdSplitKVKernel const long_index_t key_start = kargs.seqstart_k_ptr[i_batch]; batch_offset_q = query_start * kargs.stride_q; - if constexpr(kIsPagedKV) + batch_offset_k = key_start * kargs.stride_k; + if constexpr(std::is_same_v) { - batch_offset_k = static_cast(i_batch) * kargs.batch_stride_k; - batch_offset_v = static_cast(i_batch) * kargs.batch_stride_v; + batch_offset_v = key_start * kargs.stride_v; } else { - batch_offset_k = key_start * kargs.stride_k; - if constexpr(std::is_same_v) - { - batch_offset_v = key_start * kargs.stride_v; - } - else - { - batch_offset_v = key_start; - } + batch_offset_v = key_start; } if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) { @@ -551,6 +552,15 @@ struct FmhaFwdSplitKVKernel { kargs.seqlen_k = kargs.seqstart_k_ptr[i_batch + 1] - kargs.seqstart_k_ptr[i_batch]; } + + if constexpr(kIsPagedKV) + { + if(kargs.is_gappy) + { + // seqstart_k_ptr has different meaning in this case + kv_l2p_offset = kargs.seqstart_k_ptr[i_batch]; + } + } } else { @@ -703,7 +713,7 @@ struct FmhaFwdSplitKVKernel reinterpret_cast(kargs.block_table_ptr) + i_batch_ * kargs.batch_stride_block_table; const index_t num_blocks = - integer_divide_ceil(kargs.seqlen_k, kargs.page_block_size); + integer_divide_ceil(kv_l2p_offset + kargs.seqlen_k, kargs.page_block_size); const long_index_t fixed_offset = static_cast(i_nhead_ / kargs.nhead_ratio_qk) * @@ -718,7 +728,8 @@ struct FmhaFwdSplitKVKernel kargs.page_block_size, k_dram, make_k_dram(nullptr, - kargs.seqlen_k - (num_blocks - 1) * kargs.page_block_size)); + (kv_l2p_offset + kargs.seqlen_k) - + (num_blocks - 1) * kargs.page_block_size)); } else { @@ -733,7 +744,7 @@ struct FmhaFwdSplitKVKernel reinterpret_cast(kargs.block_table_ptr) + i_batch_ * kargs.batch_stride_block_table; const index_t num_blocks = - integer_divide_ceil(kargs.seqlen_k, kargs.page_block_size); + integer_divide_ceil(kv_l2p_offset + 
kargs.seqlen_k, kargs.page_block_size); const long_index_t fixed_offset = static_cast(i_nhead_ / kargs.nhead_ratio_qk) * @@ -748,7 +759,8 @@ struct FmhaFwdSplitKVKernel kargs.page_block_size, v_dram, make_v_dram(nullptr, - kargs.seqlen_k - (num_blocks - 1) * kargs.page_block_size)); + (kv_l2p_offset + kargs.seqlen_k) - + (num_blocks - 1) * kargs.page_block_size)); } else { @@ -896,6 +908,7 @@ struct FmhaFwdSplitKVKernel mask, position_encoding, kargs.scale_s, + kv_l2p_offset, smem_ptr); } else @@ -912,6 +925,7 @@ struct FmhaFwdSplitKVKernel mask, position_encoding, kargs.scale_s, + kv_l2p_offset, smem_ptr); } }(); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp index 675a31019e..5a52fa0f67 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp @@ -18,11 +18,11 @@ struct FmhaFwdSplitKVTilePartitioner static constexpr ck_tile::index_t kN1 = BlockFmhaShape::kN1; static constexpr ck_tile::index_t kK1 = BlockFmhaShape::kK1; - __host__ static constexpr auto GridSize(ck_tile::index_t batch_size, - ck_tile::index_t nhead, - ck_tile::index_t max_seqlen_q, - ck_tile::index_t hdim_v, - ck_tile::index_t num_splits) + CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size, + ck_tile::index_t nhead, + ck_tile::index_t max_seqlen_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_splits) { // TODO: this may need tuning return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, kM0) * diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp index 4e8d8694d7..04aa85644d 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp @@ 
-143,6 +143,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate void* smem_ptr) const { static_assert( @@ -211,16 +212,16 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS set_tile(m, -numeric::infinity()); clear_tile(l); - const auto q_origin = q_dram_window.get_window_origin(); - const auto [seqlen_k_start, seqlen_k_end] = mask.GetTileRangeAlongX( + const auto q_origin = q_dram_window.get_window_origin(); + const auto [logical_seqlen_k_start, logical_seqlen_k_end] = mask.GetTileRangeAlongX( q_origin.at(number<0>{}), number{}, number{}, num_splits, i_split); // check early exit if no work to do if constexpr(FmhaMask::IsMasking || kPadSeqLenK || kHasUnevenSplits) { - const index_t original_num_total_loop = - integer_divide_ceil(seqlen_k_end - seqlen_k_start, kN0); - if(original_num_total_loop <= 0) + const index_t logical_num_total_loop = + integer_divide_ceil(logical_seqlen_k_end - logical_seqlen_k_start, kN0); + if(logical_num_total_loop <= 0) { if constexpr(kStoreLSE) { @@ -239,33 +240,41 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS } } - // make sure the first tile is completely located in page-block - const index_t adjusted_seqlen_k_start = [&, seqlen_k_start_ = seqlen_k_start] { - if constexpr(kIsPagedKV) - { - return kN0 * integer_divide_floor(seqlen_k_start_, kN0); - } - else - { - return seqlen_k_start_; - } - }(); + const index_t physical_seqlen_k_start = logical_seqlen_k_start + kv_l2p_offset; + const index_t physical_seqlen_k_end = logical_seqlen_k_end + kv_l2p_offset; + // make sure the first tile is completely located in page-block (page-block size should be + // divisible by kN0) + // relationship between each *_start variables: aligned_physical_seqlen_k_start <= + // physical_seqlen_k_start, logical_seqlen_k_start <= physical_seqlen_k_start + const index_t aligned_physical_seqlen_k_start = + [&, physical_seqlen_k_start_ 
= physical_seqlen_k_start] { + if constexpr(kIsPagedKV) + { + return kN0 * integer_divide_floor(physical_seqlen_k_start_, kN0); + } + else + { + return physical_seqlen_k_start_; + } + }(); const index_t num_total_loop = - integer_divide_ceil(seqlen_k_end - adjusted_seqlen_k_start, kN0); + integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0); auto [i_page_block_k, k_dram_block_window] = k_page_block_navigator.make_tile_window( - k_dram_block_window_lengths, {adjusted_seqlen_k_start, 0}); + k_dram_block_window_lengths, {aligned_physical_seqlen_k_start, 0}); const auto bias_origin = bias_dram_block_window_tmp.get_window_origin(); auto bias_dram_window = make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(), bias_dram_block_window_tmp.get_window_lengths(), - {bias_origin.at(number<0>{}), adjusted_seqlen_k_start}, // M/N + {bias_origin.at(number<0>{}), + logical_seqlen_k_start - (physical_seqlen_k_start - + aligned_physical_seqlen_k_start)}, // M/N Policy::template MakeBiasDramTileDistribution()); auto [i_page_block_v, v_dram_window] = v_page_block_navigator.make_tile_window( v_dram_block_window_lengths, - {0, adjusted_seqlen_k_start}, // TODO: hdim split? + {0, aligned_physical_seqlen_k_start}, // TODO: hdim split? 
Policy::template MakeVDramTileDistribution()); auto q_tile = tile_elementwise_in(q_element_func, q); @@ -379,7 +388,8 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS constexpr auto i_j_idx = make_tuple(idx0, idx1); s_acc(i_j_idx) *= scale_s; - position_encoding.update(s_acc(i_j_idx), row, col); + // position_encoding accept only logical coordinates, do conversion here + position_encoding.update(s_acc(i_j_idx), row, col - kv_l2p_offset); }); }); } @@ -397,29 +407,31 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS { const auto k_origin = k_page_block_navigator.to_global_window_origin( i_page_block_k, k_dram_block_window.get_window_origin()); - set_tile_if(s_acc, - -numeric::infinity(), - [&, seqlen_k_start_ = seqlen_k_start, seqlen_k_end_ = seqlen_k_end]( - auto tile_idx) { - const auto col = - k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); - if constexpr(kIsPagedKV) - { - return col < seqlen_k_start_ || seqlen_k_end_ <= col; - } - else - { - return seqlen_k_end_ <= col; - } - }); + set_tile_if( + s_acc, + -numeric::infinity(), + [&, + physical_seqlen_k_start_ = physical_seqlen_k_start, + physical_seqlen_k_end_ = physical_seqlen_k_end](auto tile_idx) { + const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); + if constexpr(kIsPagedKV) + { + return col < physical_seqlen_k_start_ || physical_seqlen_k_end_ <= col; + } + else + { + return physical_seqlen_k_end_ <= col; + } + }); } if constexpr(kPadSeqLenK || FmhaMask::IsMasking) { const auto k_origin = k_page_block_navigator.to_global_window_origin( i_page_block_k, k_dram_block_window.get_window_origin()); + // mask accept only logical coordinates, do conversion here bool need_perpixel_check = mask.IsEdgeTile(q_origin.at(number<0>{}), - k_origin.at(number<0>{}), + k_origin.at(number<0>{}) - kv_l2p_offset, number{}, number{}); if(need_perpixel_check) @@ -428,7 +440,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS s_acc, -numeric::infinity(), [&](auto tile_idx) { const auto row = q_origin.at(number<0>{}) + 
tile_idx.at(number<0>{}); const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); - return mask.IsOutOfBound(row, col); + return mask.IsOutOfBound(row, col - kv_l2p_offset); }); } } @@ -659,6 +671,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate void* smem_ptr) const { return operator()(q_dram_block_window_tmp, @@ -681,6 +694,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS mask, position_encoding, scale_s, + kv_l2p_offset, smem_ptr); } }; From b70f367f8051e0c66071a25ab95a77e076762808 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Tue, 26 Nov 2024 13:56:32 +0100 Subject: [PATCH 17/52] Add check for bf16 splitk support for grouped gemm splitk (#1673) * add check for bf16 splitk support for grouped gemm splitk * Update if condition --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> --- .../device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp index 6d9d1459c8..cb0afbb08d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp @@ -538,6 +538,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK && arg.K_BATCH > 1 && !is_bf16_atomic_supported()) + { + return false; + } + bool supported = true; for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i) { From bfe983a1518935ef8d81066b540b8aea51b8e883 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:36:53 +0100 Subject: [PATCH 18/52] Change block gemm pipeline local prefill loop 
order. (#1692) * Fix loop order. * Fix loop order in pipeline v4 --- .../blockwise_gemm_pipeline_xdlops_v2.hpp | 130 +++++++++--------- .../blockwise_gemm_pipeline_xdlops_v4.hpp | 65 +++++---- 2 files changed, 96 insertions(+), 99 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp index 711c47854a..54edf0c353 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp @@ -269,15 +269,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run( - b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); }); }); @@ -341,14 +340,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); }); }); @@ -396,14 +395,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + 
b_thread_buf); }); }); @@ -447,14 +446,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_buf); }); }); @@ -760,15 +759,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run( - b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k0, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); }); __builtin_amdgcn_sched_barrier(0); // NOTE: Synchronize threads in a workgroup at the start of each MAC @@ -866,14 +864,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k0, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); }); __builtin_amdgcn_sched_barrier(0); @@ -942,14 +940,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k0, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + 
b_thread_buf); }); __builtin_amdgcn_sched_barrier(0); @@ -1018,14 +1016,14 @@ struct BlockwiseGemmXdlops_pipeline_v2{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k0, I0), - b_thread_buf); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k0, I0), + b_thread_buf); }); __builtin_amdgcn_sched_barrier(0); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp index bd5a1bedf5..e8d1051111 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp @@ -305,14 +305,14 @@ struct BlockwiseGemmXdlops_pipeline_v4{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf.At(I0), - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_bufs(I0)); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf.At(I0), + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(I0)); }); }); @@ -356,15 +356,14 @@ struct BlockwiseGemmXdlops_pipeline_v4{}([&](auto n0) { - b_thread_copy_.Run( - b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf.At(lds_read_buf), - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_bufs(lds_read_reg_buf)); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf.At(lds_read_buf), + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(lds_read_reg_buf)); }); }); @@ -437,14 +436,14 @@ struct 
BlockwiseGemmXdlops_pipeline_v4{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf.At(lds_read_buf), - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_bufs(lds_read_reg_buf)); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf.At(lds_read_buf), + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(lds_read_reg_buf)); }); }); @@ -496,14 +495,14 @@ struct BlockwiseGemmXdlops_pipeline_v4{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf.At(lds_read_buf), - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_bufs(lds_read_reg_buf)); - }); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf.At(lds_read_buf), + b_thread_desc_, + make_tuple(n0, I0, k, I0), + b_thread_bufs(lds_read_reg_buf)); }); }); From abae2afc721d9b335ef07d7227e0f9e55b1c575a Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 27 Nov 2024 05:01:15 +0800 Subject: [PATCH 19/52] support max3 in smoothquant and add+ rmsnorm + rdquant (#1654) * Fix cmake example build * Support max3 in smoothquant one pass * support max3 in two pass * support max3 in add_rmsnorm_rdquant --- example/ck_tile/12_smoothquant/CMakeLists.txt | 4 +- ...msnorm2d_rdquant_fwd_pipeline_one_pass.hpp | 37 +++++++++++++++---- ...norm2d_rdquant_fwd_pipeline_three_pass.hpp | 26 ++++++++++--- .../smoothquant_pipeline_one_pass.hpp | 30 +++++++++++++-- .../smoothquant_pipeline_two_pass.hpp | 16 +++++++- 5 files changed, 94 insertions(+), 19 deletions(-) diff --git a/example/ck_tile/12_smoothquant/CMakeLists.txt b/example/ck_tile/12_smoothquant/CMakeLists.txt index 09a56c6dab..3849833aca 100644 --- a/example/ck_tile/12_smoothquant/CMakeLists.txt +++ b/example/ck_tile/12_smoothquant/CMakeLists.txt @@ -18,7 
+18,7 @@ function (add_smoothquant_example TARGET_NAME MAIN_SRC) target_compile_options(${TARGET_NAME} PRIVATE ${COMPILE_OPTIONS}) endfunction(add_smoothquant_example TARGET_NAME MAIN_SRC) -file(GLOB INSTANCE_SRCS instances/*.cpp) -add_smoothquant_example(tile_smoothquant smoothquant.cpp ${INSTANCE_SRCS}) add_smoothquant_example(tile_example_smoothquant example_smoothquant.cpp) +file(GLOB INSTANCE_SRCS instances/*.cpp) +add_smoothquant_example(tile_smoothquant smoothquant.cpp ${INSTANCE_SRCS}) diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp index 12a15938ae..24f35d3636 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp @@ -28,8 +28,9 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass static constexpr bool kSaveX = Problem::kSaveX; static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; - static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM - static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool UseMax3 = true; // TODO - Move to trait static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) @@ -69,9 +70,16 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass auto reduce_square_sum_func = ReduceOp::SquareAdd{}; auto reduce_sum_func = ReduceOp::Add{}; auto reduce_absmax_func = ReduceOp::AbsMax{}; - auto reduce_max_func = ReduceOp::Max{}; - auto block_reduce2d = Policy::template GetBlockReduce2d(); - auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) 
{ + float rtn; + asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)" + : "=v"(rtn) + : "v"(acc_), "v"(v_0_), "v"(v_1_)); + return rtn; + }; + auto reduce_max_func = ReduceOp::Max{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); auto block_reduce2d_cross_warp_sync = Policy::template GetBlockReduce2dCrossWarpSync(); @@ -116,8 +124,23 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass }); // compute absmax, each-thread->cross-lane->cross-warp - auto absmax = block_reduce2d( - y, reduce_absmax_func.GetIdentityValue(), reduce_absmax_func); + auto absmax = [&]() { + constexpr auto x_size_per_row = + x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{}); + if constexpr(UseMax3 && std::is_same_v && + x_size_per_row % 2 == 0) + { + return block_reduce2d(y, + reduce_absmax_func.GetIdentityValue(), + reduce_absmax3_func, + sequence<1, 2>{}); + } + else + { + return block_reduce2d( + y, reduce_absmax_func.GetIdentityValue(), reduce_absmax_func); + } + }(); block_reduce2d_sync(absmax, reduce_max_func); block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func); diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp index 0dbb20645a..aec7368e27 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp @@ -28,8 +28,9 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass static constexpr bool kSaveX = Problem::kSaveX; static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; - static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM - static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kPadM = 
false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool UseMax3 = true; // TODO - Move to trait static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) @@ -76,9 +77,16 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass auto reduce_square_sum_func = ReduceOp::SquareAdd{}; auto reduce_sum_func = ReduceOp::Add{}; auto reduce_absmax_func = ReduceOp::AbsMax{}; - auto reduce_max_func = ReduceOp::Max{}; - auto block_reduce2d = Policy::template GetBlockReduce2d(); - auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) { + float rtn; + asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)" + : "=v"(rtn) + : "v"(acc_), "v"(v_0_), "v"(v_1_)); + return rtn; + }; + auto reduce_max_func = ReduceOp::Max{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); auto block_reduce2d_cross_warp_sync = Policy::template GetBlockReduce2dCrossWarpSync(); @@ -177,7 +185,13 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass y(idx) = type_convert(y_); }); - block_reduce2d(y, absmax, reduce_absmax_func); + constexpr auto x_size_per_row = + x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{}); + if constexpr(UseMax3 && std::is_same_v && + x_size_per_row % 2 == 0) + block_reduce2d(y, absmax, reduce_absmax3_func, sequence<1, 2>{}); + else + block_reduce2d(y, absmax, reduce_absmax_func); if constexpr(kSaveX) move_tile_window(x_window, {0, -Block_N}); diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp index d5b3780dea..b2fc240c1d 100644 --- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp @@ -25,6 +25,7 @@ 
struct SmoothquantPipelineOnePass static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM static constexpr bool kPadN = Problem::kPadN; + static constexpr bool UseMax3 = true; // TODO - Move to trait static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) @@ -52,7 +53,15 @@ struct SmoothquantPipelineOnePass xscale_window_, Policy::template MakeXScaleBlockTileDistribution()); auto reduce_absmax_func = ReduceOp::AbsMax{}; - auto reduce_max_func = ReduceOp::Max{}; + auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) { + float rtn; + asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)" + : "=v"(rtn) + : "v"(acc_), "v"(v_0_), "v"(v_1_)); + return rtn; + }; + auto reduce_max_func = ReduceOp::Max{}; + auto block_reduce2d = Policy::template GetBlockReduce2d(); auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); auto block_reduce2d_cross_warp_sync = @@ -68,8 +77,23 @@ struct SmoothquantPipelineOnePass xscale); // compute absmax, cross-lane->cross-warp - auto absmax = block_reduce2d( - y, reduce_absmax_func.GetIdentityValue(), reduce_absmax_func); + auto absmax = [&]() { + constexpr auto x_size_per_row = + x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{}); + if constexpr(UseMax3 && std::is_same_v && + x_size_per_row % 2 == 0) + { + return block_reduce2d(y, + reduce_absmax_func.GetIdentityValue(), + reduce_absmax3_func, + sequence<1, 2>{}); + } + else + { + return block_reduce2d( + y, reduce_absmax_func.GetIdentityValue(), reduce_absmax_func); + } + }(); block_reduce2d_sync(absmax, reduce_max_func); block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func); diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp index 7878ef1d34..9e9df663b9 100644 --- 
a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp +++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp @@ -25,6 +25,7 @@ struct SmoothquantPipelineTwoPass static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync; static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM static constexpr bool kPadN = Problem::kPadN; + static constexpr bool UseMax3 = true; // TODO - Move to trait static constexpr const char* name = []() { if constexpr(kNeedCrossWarpSync) @@ -56,6 +57,13 @@ struct SmoothquantPipelineTwoPass __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N)); auto reduce_absmax_func = ReduceOp::AbsMax{}; + auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) { + float rtn; + asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)" + : "=v"(rtn) + : "v"(acc_), "v"(v_0_), "v"(v_1_)); + return rtn; + }; auto reduce_max_func = ReduceOp::Max{}; auto block_reduce2d = Policy::template GetBlockReduce2d(); auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); @@ -77,7 +85,13 @@ struct SmoothquantPipelineTwoPass x, xscale); - block_reduce2d(y, absmax, reduce_absmax_func); + constexpr auto x_size_per_row = + x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{}); + if constexpr(UseMax3 && std::is_same_v && + x_size_per_row % 2 == 0) + block_reduce2d(y, absmax, reduce_absmax3_func, sequence<1, 2>{}); + else + block_reduce2d(y, absmax, reduce_absmax_func); move_tile_window(x_window, {0, Block_N}); move_tile_window(xscale_window, {Block_N}); From cb8c7f42d6123f548306cbd679c3d18349f10b6d Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:58:35 -0800 Subject: [PATCH 20/52] update mainline compiler branch name (#1696) --- Dockerfile | 4 ++-- Jenkinsfile | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 
76e6f0ebea..38a563ce33 100644 --- a/Dockerfile +++ b/Dockerfile @@ -116,7 +116,7 @@ ENV compiler_commit=$compiler_commit RUN sh -c "echo compiler version = '$compiler_version'" && \ sh -c "echo compiler commit = '$compiler_commit'" -RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \ +RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ cd llvm-project && mkdir build && cd build && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ @@ -124,7 +124,7 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd else echo "using the release compiler"; \ fi -RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" != "" ]; then \ +RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ diff --git a/Jenkinsfile b/Jenkinsfile index 2f790d8e5b..b448a5130b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -133,7 +133,7 @@ def buildDocker(install_prefix){ def image_name = getDockerImageName() echo "Building Docker for ${image_name}" def dockerArgs = "--squash --build-arg 
BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' " - if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){ + if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ dockerArgs = dockerArgs + " --no-cache " } echo "Build Args: ${dockerArgs}" @@ -358,7 +358,7 @@ def buildHipClangJob(Map conf=[:]){ dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){ + if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " } def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') @@ -549,7 +549,7 @@ def Build_CK(Map conf=[:]){ dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){ + if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || 
params.COMPILER_COMMIT != ""){ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " } if(params.BUILD_LEGACY_OS){ @@ -737,7 +737,7 @@ def process_results(Map conf=[:]){ CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true 0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true;RUN_CODEGEN_TESTS=true 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true - 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true + 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false 0 13 * * * % BUILD_LEGACY_OS=true''' : "" @@ -765,7 +765,7 @@ pipeline { string( name: 'COMPILER_VERSION', defaultValue: '', - description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline-open, or leave blank (default).') + description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline, or leave blank (default).') string( name: 'COMPILER_COMMIT', defaultValue: '', From 061ac0649c75deb315a418466d00dea2c49e65f3 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:02:44 +0100 Subject: [PATCH 21/52] Polished Grouped GEMM APIs and new BF16 instances (#1600) * Few small fixes. * New GroupedGemm instances (BF16) * Unify and refactor GroupedGEMM device API. * Adapt changes to new API. * Adapt grouped gemm profiler. * Accept multiple kbatches for grouped gemm profiler. 
- delete obsolete two stage as it is now covered by grouped gemm * Update unit test for grouped gemm. * Fix thresholds for BF16 and F8. Unblock tests. * Fix few instances. * Multiple small fixes. * Adapt to new API, check dynamic casting. * Uncomment few data types in grouped gemm profiler. * Fix call to SetDeviceArgs. * Fix profile grouped gemm multiply tile loop. * Fix grouped gemm tile loop kernel args in client examples. * Review comments. --- ...emm_multiply_bias_fastgelu_xdl_bf16_i8.cpp | 2 +- .../grouped_gemm_multiply_xdl_bf16_i8.cpp | 2 +- ...rouped_gemm_multiple_d_splitk_xdl_fp16.cpp | 4 +- .../grouped_gemm_multiple_d_xdl_fp16.cpp | 2 +- .../grouped_gemm_xdl_fixed_nk_bias_fp16.cpp | 4 +- .../grouped_gemm_xdl_fixed_nk_fp16.cpp | 4 +- .../grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp | 4 +- .../run_grouped_gemm_example.inc | 18 +- .../gpu/device/device_grouped_gemm.hpp | 132 ++++++- .../device/device_grouped_gemm_fixed_nk.hpp | 50 +-- .../device_grouped_gemm_multiple_d_splitk.hpp | 136 ------- .../gpu/device/device_grouped_gemm_splitk.hpp | 20 +- .../device/device_grouped_gemm_tile_loop.hpp | 92 +---- ...ltiple_d_splitk_xdl_cshuffle_two_stage.hpp | 99 +++-- ...gemm_multiple_d_xdl_cshuffle_tile_loop.hpp | 24 +- .../device/impl/device_grouped_gemm_xdl.hpp | 21 +- .../impl/device_grouped_gemm_xdl_fixed_nk.hpp | 72 +++- ...evice_grouped_gemm_xdl_splitk_cshuffle.hpp | 35 +- include/ck/utility/loop_scheduler.hpp | 1 - .../gpu/grouped_gemm.hpp | 185 ++++++++- ...evice_grouped_gemm_xdl_splitk_instance.hpp | 138 +++++++ .../gpu/grouped_gemm/CMakeLists.txt | 22 +- ..._bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp | 32 ++ ...bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp | 36 ++ ..._bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp | 33 ++ ..._bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp | 32 ++ ...bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp | 36 ++ ..._bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp | 38 ++ ..._bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp | 32 ++ 
...bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp | 36 ++ ..._bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp | 33 ++ ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp | 47 +-- ...16_f16_f16_mk_kn_mn_irregular_instance.cpp | 123 ------ ...itk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp | 32 ++ ...6_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp | 36 ++ ...itk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp | 33 ++ ...l_splitk_f16_f16_f16_mk_nk_mn_instance.cpp | 51 +-- ...16_f16_f16_mk_nk_mn_irregular_instance.cpp | 55 +-- ...ultiply_bf16_i8_bf16_mk_kn_mn_instance.cpp | 234 ----------- .../profiler/profile_grouped_gemm_impl.hpp | 119 +++--- ...e_grouped_gemm_multiply_tile_loop_impl.hpp | 3 +- .../profile_grouped_gemm_tile_loop_impl.hpp | 2 +- .../profile_grouped_gemm_two_stage_impl.hpp | 367 ------------------ profiler/src/CMakeLists.txt | 1 - profiler/src/profile_grouped_gemm.cpp | 89 ++++- .../src/profile_grouped_gemm_fixed_nk.cpp | 8 +- .../src/profile_grouped_gemm_two_stage.cpp | 228 ----------- test/grouped_gemm/CMakeLists.txt | 6 - .../test_grouped_gemm_splitk_xdl.cpp | 42 +- .../test_grouped_gemm_ut_cases.inc | 133 +------ test/grouped_gemm/test_grouped_gemm_util.hpp | 139 +++---- 51 files changed, 1400 insertions(+), 1723 deletions(-) delete mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp delete mode 100644 profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp delete mode 100644 profiler/src/profile_grouped_gemm_two_stage.cpp diff --git a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp index 
4b284c74d4..47d3e0abf9 100644 --- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp +++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp @@ -121,7 +121,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co constexpr ck::index_t NumDTensor = 2; using GroupedGemmKernelArgument = - ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments; + ck::tensor_operation::device::GroupedGemmKernelArgument; std::vector grouped_gemm_kernel_args_; grouped_gemm_kernel_args_.reserve(group_count); diff --git a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp index 6cc83e06f6..8c705d3bcc 100644 --- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp +++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp @@ -120,7 +120,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co constexpr ck::index_t NumDTensor = 1; using GroupedGemmKernelArgument = - ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments; + ck::tensor_operation::device::GroupedGemmKernelArgument; std::vector grouped_gemm_kernel_args_; grouped_gemm_kernel_args_.reserve(group_count); diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp index ecff7b4713..8bbf8e629e 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp @@ -246,7 +246,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co // do GEMM auto argument = gemm.MakeArgument( p_As, p_Bs, p_Ds, p_Cs, gemm_descs, a_element_op, b_element_op, cde_element_op); - gemm.SetKBatchSize(argument, config.k_batch); + 
gemm.SetKBatchSize(&argument, config.k_batch); if(!gemm.IsSupportedArgument(argument)) { throw std::runtime_error( @@ -257,7 +257,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co gemm.SetWorkSpacePointer(&argument, gemm_workspace_dev.GetDeviceBuffer()); DeviceMem gemm_arg_dev_mem(gemm.GetDeviceKernelArgSize(&argument)); - gemm.SetDeviceKernelArgs(argument, gemm_arg_dev_mem.GetDeviceBuffer()); + gemm.SetDeviceKernelArgs(&argument, gemm_arg_dev_mem.GetDeviceBuffer()); invoker.Run(argument, StreamConfig{nullptr, false, 1}); diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp index 965a0e7e37..e7b2ee4173 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp @@ -91,7 +91,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co { auto group_count = problem_size.group_count; - using KernelArguments = ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments; + using KernelArguments = ck::tensor_operation::device::GroupedGemmKernelArgument; using GemmDesc = ck::tensor_operation::device::GemmDesc; // GEMM shape diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp index a193fc39ba..3b3ef508ce 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -254,7 +254,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co gemm.GetDeviceKernelArgSize(&argument), hipMemcpyHostToDevice)); - gemm.SetDeviceKernelArgs(argument, gemm_kernel_args_dev.GetDeviceBuffer()); + gemm.SetDeviceKernelArgs(&argument, gemm_kernel_args_dev.GetDeviceBuffer()); gemm.SetKBatch(argument, config.k_batch); invoker.Run(argument, StreamConfig{nullptr, false}); diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp index 1a2bcfb33e..c1043f419d 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -239,7 +239,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co "not support this GEMM problem"); } - gemm.SetDeviceKernelArgs(argument, gemm_arg_dev_mem.GetDeviceBuffer()); + gemm.SetDeviceKernelArgs(&argument, gemm_arg_dev_mem.GetDeviceBuffer()); gemm.SetKBatch(argument, config.k_batch); invoker.Run(argument, StreamConfig{nullptr, false}); diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp index 0a63a29843..c81874b066 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -240,7 +240,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co "not support this GEMM problem"); } - gemm.SetDeviceKernelArgs(argument, gemm_arg_dev_mem.GetDeviceBuffer()); + gemm.SetDeviceKernelArgs(&argument, gemm_arg_dev_mem.GetDeviceBuffer()); gemm.SetKBatch(argument, config.k_batch); invoker.Run(argument, StreamConfig{nullptr, false}); diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc index 320870e0de..7cb0588b82 100644 --- a/example/15_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc @@ -168,9 +168,23 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co auto argument = gemm.MakeArgument( p_a, p_b, p_Ds, p_c, gemm_descs, a_element_op, b_element_op, c_element_op); - DeviceMem gemm_desc_workspace(gemm.GetWorkSpaceSize(&argument)); + std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument); + std::size_t kargs_size = gemm.GetDeviceKernelArgSize(&argument); - gemm.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer()); + DeviceMem gemm_workspace, gemm_kargs; + + // The following is necessary since TwoStage kernel is using additional memory both + // for Workspace and kernel arguments. 
+ if(kargs_size > 0) + { + gemm_kargs.Realloc(kargs_size); + gemm.SetDeviceKernelArgs(&argument, gemm_kargs.GetDeviceBuffer()); + } + if(workspace_size > 0 && workspace_size != kargs_size) + { + gemm_workspace.Realloc(workspace_size); + gemm.SetWorkSpacePointer(&argument, gemm_workspace.GetDeviceBuffer()); + } if(!gemm.IsSupportedArgument(argument)) { diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp index 1e03405536..267a970ee5 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp @@ -1,17 +1,87 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#include #include +#include +#include #include #include "device_base.hpp" +#include "ck/utility/ignore.hpp" namespace ck { namespace tensor_operation { namespace device { +/// +/// @brief Structure representing single GEMM problem arguments. +/// +/// The pointer to the vector of those structures is passed to the GroupedGEMM entry +/// point kernel. +/// +/// @tparam NumDTensor The number of D input tensors. 
+/// +template +struct GroupedGemmKernelArgument +{ + __host__ __device__ GroupedGemmKernelArgument(const void* p_a_grid_, + const void* p_b_grid_, + std::array p_ds_grid_, + void* p_e_grid_, + index_t M_, + index_t N_, + index_t K_, + index_t StrideA_, + index_t StrideB_, + std::array StrideDs_, + index_t StrideE_) + : p_a_grid{p_a_grid_}, + p_b_grid{p_b_grid_}, + p_ds_grid{p_ds_grid_}, + p_e_grid{p_e_grid_}, + M{M_}, + N{N_}, + K{K_}, + StrideA{StrideA_}, + StrideB{StrideB_}, + StrideDs{StrideDs_}, + StrideE{StrideE_} + { + } + + const void* p_a_grid; + const void* p_b_grid; + std::array p_ds_grid; + void* p_e_grid; + index_t M; + index_t N; + index_t K; + index_t StrideA; + index_t StrideB; + std::array StrideDs; + index_t StrideE; + + void Print() const + { + std::stringstream str; + for(auto sd : StrideDs) + str << sd << ","; + + std::cout << "arg {" + << "M:" << M << ", " + << "N:" << N << ", " + << "K:" << K << ", " + << "SA:" << StrideA << ", " + << "SB:" << StrideB << ", " + << "SE:" << StrideE << ", " + << "SDs: {" << str.str() << "}" + << "}" << std::endl; + } +}; + struct GemmDesc { ck::index_t M_, N_, K_; @@ -48,6 +118,66 @@ struct DeviceGroupedGemm : public BaseOperator CElementwiseOperation c_element_op) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; + + //--------------------------------------------------------------------------------------------- + /// @brief Sets the device kernel arguments pointer and may copy data to device. + /// + /// TODO: Add which kernels are using this (TileLoop * FixedNK ??) + /// + /// @param p_arg The pointer to the Argument we're going to update. + /// @param[in] p_dev_kernel_args The pointer to the device memory which will contain kernel + /// arguments. + /// @param[in] p_host_kernel_args The pointer to the host memory which contains kernel + /// arguments that should be copied to device memory. 
+ /// + virtual void SetDeviceKernelArgs(BaseArgument* p_arg, + void* p_dev_kernel_args, + const void* p_host_kernel_args) const + { + ignore = p_arg; + ignore = p_dev_kernel_args; + ignore = p_host_kernel_args; + + std::ostringstream err; + err << "This function is not implemented by the kernel: " << this->GetTypeString() + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } + + //---------------------------------------------------------------------------------------------- + /// @brief Sets the device kernel arguments pointer and may copy data to device. + /// + /// @param p_arg The pointer to the Argument we're going to update. + /// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel + /// arguments. + /// + virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const + { + ignore = p_arg; + ignore = p_dev_kernel_args; + + std::ostringstream err; + err << "This function is not implemented by the kernel: " << this->GetTypeString() + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } + + //---------------------------------------------------------------------------------------------- + /// @brief Gets the device kernel argument size. + /// + /// @param[in] p_arg The pointer to the Device op Argument. + /// + /// @return The device kernel argument size. 
+ /// + virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const + { + ignore = p_arg; + + std::ostringstream err; + err << "This function is not implemented by the kernel: " << this->GetTypeString() + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp index fcb2ba6a4d..780a0c30c5 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp @@ -1,35 +1,14 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once -#include -#include - -#include "device_grouped_gemm.hpp" +#include "device_grouped_gemm_splitk.hpp" namespace ck { namespace tensor_operation { namespace device { -template -struct GroupedGemmKernelArgument -{ - const void* p_a_grid; - const void* p_b_grid; - std::array p_ds_grid; - void* p_e_grid; - - index_t M; - index_t N; - index_t K; - - index_t StrideA; - index_t StrideB; - std::array StrideDs; - index_t StrideE; -}; - template -struct DeviceGroupedGemmFixedNK : DeviceGroupedGemm +struct DeviceGroupedGemmFixedNK : DeviceGroupedGemmSplitK { - virtual void SetDeviceKernelArgs(BaseArgument* p_arg, const void* kernel_args) const = 0; - virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const = 0; - virtual void SetKBatch(BaseArgument* p_arg, index_t k_batch) const = 0; }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp deleted file mode 100644 index d91eac0730..0000000000 --- 
a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include -#include -#include - -#include "device_grouped_gemm.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -/// -/// @brief Structure representing single GEMM problem arguments. -/// -/// The pointer to the vector of those structures is passed to the GroupedGEMM entry -/// point kernel. -/// -/// @tparam NumDTensor The number of D input tensors. -/// -template -struct GroupedGemmMultipleDKernelArguments -{ - __host__ __device__ - GroupedGemmMultipleDKernelArguments(const void* p_a_grid_, - const void* p_b_grid_, - std::array p_ds_grid_, - void* p_e_grid_, - index_t M_, - index_t N_, - index_t K_, - index_t StrideA_, - index_t StrideB_, - std::array StrideDs_, - index_t StrideE_) - : p_a_grid{p_a_grid_}, - p_b_grid{p_b_grid_}, - p_ds_grid{p_ds_grid_}, - p_e_grid{p_e_grid_}, - M{M_}, - N{N_}, - K{K_}, - StrideA{StrideA_}, - StrideB{StrideB_}, - StrideDs{StrideDs_}, - StrideE{StrideE_} - { - } - - const void* p_a_grid; - const void* p_b_grid; - std::array p_ds_grid; - void* p_e_grid; - index_t M; - index_t N; - index_t K; - index_t StrideA; - index_t StrideB; - std::array StrideDs; - index_t StrideE; - - void Print() const - { - std::stringstream str; - for(auto sd : StrideDs) - str << sd << ","; - - std::cout << "arg {" - << "M:" << M << ", " - << "N:" << N << ", " - << "K:" << K << ", " - << "SA:" << StrideA << ", " - << "SB:" << StrideB << ", " - << "SE:" << StrideE << ", " - << "SDs: {" << str.str() << "}" - << "}" << std::endl; - } -}; - -template -struct DeviceGroupedGemmMultipleDSplitK : public DeviceGroupedGemm -{ - //---------------------------------------------------------------------------------------------- - /// @brief Sets the k batch size. 
- /// - /// @param p_arg Pointer to the Argument we're going to change. - /// @param[in] kbatch The kbatch value. - /// - virtual void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const = 0; - - //---------------------------------------------------------------------------------------------- - /// @brief Sets the device kernel arguments pointer. - /// - /// @param p_arg The pointer to the Argument we're going to update. - /// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel - /// arguments. - /// - virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const = 0; - - //---------------------------------------------------------------------------------------------- - /// @brief Gets the device kernel argument size. - /// - /// @param[in] p_arg The pointer to the Device op Argument. - /// - /// @return The device kernel argument size. - /// - virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const = 0; -}; - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp index 06d180d30f..3ea6501902 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once -#include -#include #include "device_grouped_gemm.hpp" @@ -31,7 +31,23 @@ struct DeviceGroupedGemmSplitK : public DeviceGroupedGemm { + //---------------------------------------------------------------------------------------------- + /// @brief Sets the k batch size. + /// + /// @param p_arg Pointer to the Argument we're going to change. + /// @param[in] kbatch The kbatch value. 
+ /// virtual void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const = 0; + //---------------------------------------------------------------------------------------------- + /// @brief Sets the k batch size. + /// + /// @param p_arg Pointer to the Argument we're going to change. + /// @param[in] kbatch The kbatch value. + /// + virtual void SetKBatch(BaseArgument* p_arg, index_t kbatch) const + { + this->SetKBatchSize(p_arg, kbatch); + }; }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp index c1030f31cc..712fbfd9e9 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp @@ -3,83 +3,20 @@ #pragma once -#include -#include -#include -#include - #include "device_grouped_gemm.hpp" namespace ck { namespace tensor_operation { namespace device { +/// @brief Grouped GEMM kernel using output Tile Looping algorithm /// -/// @brief Structure representing single GEMM problem arguments. +/// @par This kernel does not require any knowledge about input data sizes (GEMM M/N/K) +/// It requires only the number of groups to launch. Other information like +/// data pointers and GEMM sizes, packed into gemm kernel args may be all dynamic +/// (known only at kernel run-time). /// -/// The pointer to the vector of those structures is passed to the GroupedGEMM entry -/// point kernel. -/// -/// @tparam NumDTensor The number of D input tensors. 
-/// -template -struct GroupedGemmTileLoopKernelArguments -{ - __host__ __device__ - GroupedGemmTileLoopKernelArguments(const void* p_a_grid_, - const void* p_b_grid_, - std::array p_ds_grid_, - void* p_e_grid_, - index_t M_, - index_t N_, - index_t K_, - index_t StrideA_, - index_t StrideB_, - std::array StrideDs_, - index_t StrideE_) - : p_a_grid{p_a_grid_}, - p_b_grid{p_b_grid_}, - p_ds_grid{p_ds_grid_}, - p_e_grid{p_e_grid_}, - M{M_}, - N{N_}, - K{K_}, - StrideA{StrideA_}, - StrideB{StrideB_}, - StrideDs{StrideDs_}, - StrideE{StrideE_} - { - } - - const void* p_a_grid; - const void* p_b_grid; - std::array p_ds_grid; - void* p_e_grid; - index_t M; - index_t N; - index_t K; - index_t StrideA; - index_t StrideB; - std::array StrideDs; - index_t StrideE; - - void Print() const - { - std::stringstream str; - for(auto sd : StrideDs) - str << sd << ","; - - std::cout << "arg {" - << "M:" << M << ", " - << "N:" << N << ", " - << "K:" << K << ", " - << "SA:" << StrideA << ", " - << "SB:" << StrideB << ", " - << "SE:" << StrideE << ", " - << "SDs: {" << str.str() << "}" - << "}" << std::endl; - } -}; +/// @note This kernel does not support SplitK. template { - //---------------------------------------------------------------------------------------------- - /// @brief Sets the device kernel arguments pointer. - /// - /// @param p_arg The pointer to the Argument we're going to update. - /// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel - /// arguments. - /// - virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const = 0; - - //---------------------------------------------------------------------------------------------- - /// @brief Gets the device kernel argument size. - /// - /// @param[in] p_arg The pointer to the Device op Argument. - /// - /// @return The device kernel argument size. 
- /// - virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const = 0; }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp index 68c6dcc0f5..0535c80323 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp @@ -18,7 +18,6 @@ #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" @@ -78,17 +77,17 @@ template = false> struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage - : public DeviceGroupedGemmMultipleDSplitK + : public DeviceGroupedGemmSplitK { using DeviceOp = DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage; @@ -530,7 +529,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage index_t skipped_group_count_; index_t grid_size_; // Pointer to device memory with GEMM kernel arguments. - const void* p_dev_gemm_args_; + void* p_dev_gemm_kargs_; AElementwiseOperation a_element_op_; BElementwiseOperation b_element_op_; @@ -566,7 +565,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage /// @return The average kernel execution time (if time measurement is enabled.) 
/// float Run(const Argument& arg, - const void* dev_gemm_args, + void* dev_gemm_args, void* dev_gemm_workspace, const StreamConfig& stream_config = StreamConfig{}) { @@ -621,7 +620,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage /// float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - if(arg.p_dev_gemm_args_ == nullptr) + if(arg.p_dev_gemm_kargs_ == nullptr) { std::ostringstream err; err << "The gemm arguments device buffer is not allocated!" @@ -637,7 +636,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage throw std::runtime_error(err.str()); } - return Run(arg, arg.p_dev_gemm_args_, arg.p_workspace_, stream_config); + return Run(arg, arg.p_dev_gemm_kargs_, arg.p_workspace_, stream_config); } float Run(const BaseArgument* p_arg, @@ -723,7 +722,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage template float DispatchKernel(const Argument& arg, - const void* dev_gemm_args, + void* dev_gemm_kargs, void* dev_gemm_workspace, const StreamConfig& stream_config) const { @@ -746,7 +745,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage return LaunchKernel(gemm_kernel, elementwise_kernel, arg, - dev_gemm_args, + dev_gemm_kargs, dev_gemm_workspace, stream_config); } @@ -755,12 +754,19 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage float LaunchKernel(const KernelFunction& gemm_kernel, const KernelFunction2& elementwise_kernel, const Argument& arg, - const void* dev_gemm_args, + void* dev_gemm_kargs, [[maybe_unused]] void* dev_gemm_workspace, const StreamConfig& stream_config) const { float time{0.f}; + hip_check_error( + hipMemcpyWithStream(dev_gemm_kargs, + arg.gemm_kernel_args_.data(), + arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg), + hipMemcpyHostToDevice, + stream_config.stream_id_)); + auto preprocess = [&]() { hip_check_error(hipMemsetAsync( dev_gemm_workspace, 0, arg.GetWorkspaceSizeBytes(), stream_config.stream_id_)); @@ -774,7 +780,7 @@ struct 
DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage dim3(arg.grid_size_), dim3(BlockSize), 0, - cast_pointer_to_constant_address_space(dev_gemm_args), + cast_pointer_to_constant_address_space(dev_gemm_kargs), arg.gemm_kernel_args_.size(), arg.a_element_op_, arg.b_element_op_, @@ -930,18 +936,30 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage return str.str(); } - void SetDeviceKernelArgs(Argument& arg, void* p_dev_kernel_args) const - { - arg.p_dev_gemm_args_ = p_dev_kernel_args; - hip_check_error(hipMemcpy(p_dev_kernel_args, - arg.gemm_kernel_args_.data(), - GetDeviceKernelArgSize(&arg), - hipMemcpyHostToDevice)); - } - void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override { - return SetDeviceKernelArgs(*dynamic_cast(p_arg), p_dev_kernel_args); + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + arg_ptr->p_dev_gemm_kargs_ = p_dev_kernel_args; + } + else + throw std::runtime_error( + "The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"); + } + + size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override + { + auto arg = dynamic_cast(p_arg); + if(arg) + { + return arg->gemm_kernel_args_.size() * sizeof(GemmTransKernelArg); + } + else + throw std::runtime_error( + "The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"); } size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override @@ -974,17 +992,22 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"); } - static void SetKBatchSize(Argument& arg, index_t kbatch) { arg.UpdateKBatch(kbatch); } + [[deprecated]] static void SetKBatchSize(Argument& arg, index_t kbatch) + { + arg.UpdateKBatch(kbatch); + } void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override { - return SetKBatchSize(*dynamic_cast(p_arg), kbatch); - } - - 
size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override - { - return dynamic_cast(p_arg)->gemm_kernel_args_.size() * - sizeof(GemmTransKernelArg); + auto p_arg_ = dynamic_cast(p_arg); + if(p_arg_) + { + p_arg_->UpdateKBatch(kbatch); + } + else + throw std::runtime_error( + "The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"); } }; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp index 2884e558cd..f673713f3e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp @@ -20,7 +20,6 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp" // stare wywalic -#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" namespace ck { @@ -522,7 +521,7 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop ComputeTypeA, ComputeTypeB>; - using KernelArguments = GroupedGemmTileLoopKernelArguments; + using KernelArguments = GroupedGemmKernelArgument; using Block2ETileMap = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock>; using OffsettedLocalBlock2ETileMap = OffsettedBlockToCTileMap2; @@ -936,12 +935,31 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop return str.str(); } + void SetDeviceKernelArgs(Argument& arg, + void* p_dev_kernel_args, + const void* p_host_kernel_args) const + { + arg.p_dev_gemm_args_ = p_dev_kernel_args; + hip_check_error(hipMemcpy(p_dev_kernel_args, + p_host_kernel_args, + GetDeviceKernelArgSize(&arg), + 
hipMemcpyHostToDevice)); + } + + virtual void SetDeviceKernelArgs(BaseArgument* p_arg, + void* p_dev_kernel_args, + const void* p_host_kernel_args) const override + { + return SetDeviceKernelArgs( + *dynamic_cast(p_arg), p_dev_kernel_args, p_host_kernel_args); + } + void SetDeviceKernelArgs(Argument& arg, void* p_dev_kernel_args) const { arg.p_dev_gemm_args_ = p_dev_kernel_args; } - void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override + virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override { return SetDeviceKernelArgs(*dynamic_cast(p_arg), p_dev_kernel_args); } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp index 658f323516..86cf1da156 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp @@ -1,6 +1,6 @@ #pragma once // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -717,7 +717,24 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm(p_arg)->group_count_ * sizeof(GemmBiasTransKernelArg); + auto p_arg_ = dynamic_cast(p_arg); + if(p_arg_) + { + return p_arg_->group_count_ * sizeof(GemmBiasTransKernelArg); + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDXdlCShuffle::Argument structure!"); + } + + size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override + { + return GetWorkSpaceSize(p_arg); + } + + void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override + { + return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args); } }; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp index ac05a0703f..1fee02bad8 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp @@ -445,6 +445,7 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK; using GroupedGemmBlock2ETileMap = OffsettedBlockToCTileMapMLoops; + // TODO: replace with GroupedGemmKernelArgument struct GemmBiasTransKernelArg { // pointers @@ -900,40 +901,58 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK(p_arg), kernel_args); + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + arg_ptr->grouped_gemm_kernel_args_dev = kernel_args; + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); } size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override { - auto arg = *dynamic_cast(p_arg); - - return arg.group_count_ * arg.barrier_size_grp_ * sizeof(uint32_t); + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + return arg_ptr->group_count_ * arg_ptr->barrier_size_grp_ * 
sizeof(uint32_t); + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); } size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override { - auto arg = *dynamic_cast(p_arg); - - return arg.group_count_ * sizeof(GroupedGemmKernelArgument); + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + return arg_ptr->group_count_ * sizeof(GroupedGemmKernelArgument); + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); } void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace, const StreamConfig& stream_config = StreamConfig{}) const override { - auto p_arg_ = dynamic_cast(p_arg); - p_arg_->p_workspace_ = p_workspace; + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + arg_ptr->p_workspace_ = p_workspace; + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); hip_check_error( - hipMemsetAsync(p_workspace, 0, GetWorkSpaceSize(p_arg), stream_config.stream_id_)); + hipMemsetAsync(p_workspace, 0, GetWorkSpaceSize(arg_ptr), stream_config.stream_id_)); } static void SetKBatch(Argument& arg, index_t k_batch) { arg.UpdateKBatch(k_batch); } @@ -941,7 +960,26 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK(p_arg), k_batch); + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + arg_ptr->UpdateKBatch(k_batch); + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); + } + + void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override + { + auto arg_ptr = dynamic_cast(p_arg); + if(arg_ptr) + { + arg_ptr->UpdateKBatch(kbatch); + } + else + throw std::runtime_error("The argument pointer is not an object of " + "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!"); } }; diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp index cb0afbb08d..626ffbe979 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp @@ -546,7 +546,8 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK(p_arg)->gemm_kernel_args_.size() * - sizeof(GemmTransKernelArg); + auto p_arg_ = dynamic_cast(p_arg); + if(p_arg_) + { + return p_arg_->gemm_kernel_args_.size() * sizeof(GemmTransKernelArg); + } + else + throw std::runtime_error( + "The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDSplitKXdlCShuffle::Argument structure!"); } + size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override + { + return GetWorkSpaceSize(p_arg); + } + + // TODO: deperecation notice. static void SetKBatchSize(Argument& arg, index_t kbatch) { arg.UpdateKBatch(kbatch); } // polymorphic void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override { - return SetKBatchSize(*dynamic_cast(p_arg), kbatch); + auto p_arg_ = dynamic_cast(p_arg); + if(p_arg_) + { + p_arg_->UpdateKBatch(kbatch); + } + else + throw std::runtime_error( + "The argument pointer is not an object of " + "DeviceGroupedGemmMultipleDSplitKXdlCShuffle::Argument structure!"); + } + + void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override + { + return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args); } }; diff --git a/include/ck/utility/loop_scheduler.hpp b/include/ck/utility/loop_scheduler.hpp index 0c4d85bedb..a88109249d 100644 --- a/include/ck/utility/loop_scheduler.hpp +++ b/include/ck/utility/loop_scheduler.hpp @@ -5,7 +5,6 @@ #pragma once #include "ck/utility/common_header.hpp" -#include "ck/tensor_description/tensor_adaptor.hpp" namespace ck { diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp index 87426fd52e..a999f9e3a0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp @@ -95,6 +95,45 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances( PassThrough, PassThrough>>>& instances); +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2_instances( + std::vector>>& instances); + void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances( std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_instances( + std::vector>>& instances); + +void 
add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2_instances( + std::vector>>& instances); + #endif #if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8) @@ -262,7 +419,11 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_instances( op_ptrs); + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter_instances( + op_ptrs); + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_instances( + op_ptrs); + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter_instances( + op_ptrs); + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_instances( + op_ptrs); + add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2_instances( + op_ptrs); } } #endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp new file mode 100644 index 0000000000..7721e42c3c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/utility/loop_scheduler.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using Empty_Tuple = ck::Tuple<>; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto PipelineV1 = ck::PipelineVersion::v1; +static constexpr auto PipelineV2 = ck::PipelineVersion::v2; +static constexpr auto DefaultScheduler = ck::LoopScheduler::Default; +static constexpr auto InterwaveScheduler = ck::LoopScheduler::Interwave; +static constexpr auto GemmMNKPadding = device::GemmSpecialization::MNKPadding; +static constexpr auto GemmDefault = device::GemmSpecialization::Default; + +template = false> +using device_grouped_gemm_xdl_splitk_2Bt_rrr_instances = std::tuple< + // clang-format off + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Pipeline | Loop | + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| 
Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Version | Scheduler | + //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, 
S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, 
Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 24, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, 
Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler> + // clang-format on + >; + +template = false> +using device_grouped_gemm_xdl_splitk_2Bt_rcr_instances = std::tuple< + // clang-format off + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| 
GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Pipeline | Loop | + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Version | Scheduler | + //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, 
Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 
3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler> + // clang-format on + >; + +template = false> +using device_grouped_gemm_xdl_splitk_2Bt_crr_instances = std::tuple< + // clang-format off + 
//################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Pipeline | Loop | + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Version | Scheduler | + //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 
3, 2>, 2, 2, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, 
PassThrough, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, S<1, 4, 24, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 2, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + 
DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, S<1, 4, 16, 1>, 
S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler>, + DeviceGroupedGemmXdlSplitKCShuffle< Col, Row, Empty_Tuple, Row, T, T, F32, T, Empty_Tuple, T, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Pipeline, Scheduler> + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt index de20321945..4a3e1a4ada 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt @@ -4,12 +4,30 @@ add_instance_library(device_grouped_gemm_instance device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp - device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp + device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp - device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp + + device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp + device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp + device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp + device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp + + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp + + 
device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp + + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp + device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp + device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instance.cpp + device_grouped_gemm_multiple_d_splitk_xdl_two_stage_f16_f16_f16_mk_kn_mn_instance.cpp device_grouped_gemm_multiple_d_splitk_xdl_two_stage_bf16_bf16_bf16_mk_kn_mn_instance.cpp device_grouped_gemm_multiple_d_splitk_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp new file mode 100644 index 0000000000..b8a03871cd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_crr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp new file mode 100644 index 0000000000..10141165ca --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_crr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp new file mode 100644 index 0000000000..b96f5983ce --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_crr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp new file mode 100644 index 0000000000..8fad42316e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp new file mode 100644 index 0000000000..7845136ca6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp new file mode 100644 index 0000000000..a2d79edf6b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp new file mode 100644 index 0000000000..033a2929f0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_rcr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp new file mode 100644 index 0000000000..cf8c94bf46 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rcr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp new file mode 100644 index 0000000000..70c0d703ef --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rcr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp index 98e476f8bb..077a8a18ca 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp @@ -1,53 +1,14 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -template -using S = ck::Sequence; - -using Empty_Tuple = ck::Tuple<>; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; - -// a[m, k] * b[k, n] = e[m, n] -using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple< - // clang-format off - //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | 
PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, 
PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> - // clang-format on - >; - void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances( std::vector>>& instances) { - add_device_operation_instances(instances, - device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances{}); + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp deleted file mode 100644 index ed0a8c7b70..0000000000 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp +++ /dev/null @@ -1,123 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = ck::half_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -template -using S = ck::Sequence; - -using Empty_Tuple = ck::Tuple<>; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - -using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instances = std::tuple< - // clang-format off - //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | 
| | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, 
S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1>, - 
DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 24, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 
32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1>, - - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, 
LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - 
DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 24, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, 
Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, 
PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v1, LoopScheduler::Interwave>, - - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 
32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 48, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, PipelineVersion::v2>, - 
DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 24, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 64, 32, 8, 8, 
32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v2>, - 
DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v2>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, PipelineVersion::v2> - // clang-format on - >; - -void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp new file mode 100644 index 0000000000..8ad4736ac4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp new file mode 100644 index 0000000000..1d968c8210 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp new file mode 100644 index 0000000000..ee3d7d73b8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_gemm_xdl_splitk_2Bt_rrr_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp index aa6365cd98..085e74f0ca 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp @@ -1,57 +1,14 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -template -using S = ck::Sequence; - -using Empty_Tuple = ck::Tuple<>; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; - -// a[m, k] * b[n, k] = e[m, n] -using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple< - // clang-format off - //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | 
PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, 
PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, 
F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> - // clang-format on - >; - void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances( std::vector>>& instances) { - add_device_operation_instances(instances, - device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances{}); + add_device_operation_instances( + instances, device_grouped_gemm_xdl_splitk_2Bt_rcr_instances{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp index f4460b360b..320bb933b9 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp @@ -1,63 +1,14 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. 
All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -template -using S = ck::Sequence; - -using Empty_Tuple = ck::Tuple<>; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - -using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple< - // clang-format off - //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| 
PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 48, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, 
Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - 
DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 256, 32, 8, 8, 32, 32, 1, 4, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 64, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 
2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedGemmXdlSplitKCShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> - // clang-format on - >; - void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances( std::vector>>& instances) { add_device_operation_instances( - instances, device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances{}); + instances, device_grouped_gemm_xdl_splitk_2Bt_rcr_instances{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp deleted file mode 100644 index c98328e52d..0000000000 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp +++ /dev/null @@ -1,234 +0,0 @@ -// SPDX-License-Identifier: MIT -// 
Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using BF16 = ck::bhalf_t; -using I8 = int8_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -template -using S = ck::Sequence; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using Multiply = ck::tensor_operation::element_wise::Multiply; -using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu; -using MultiplyFastGelu = ck::tensor_operation::element_wise::MultiplyFastGelu; -using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd; - -static constexpr auto GemmDefault = GemmSpecialization::Default; -static constexpr auto GemmKPadding = GemmSpecialization::KPadding; -static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; -static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; - -static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; -static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; - -template -using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances = std::tuple< - // clang-format off - //###########################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| 
BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //###########################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //###########################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //###########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 
8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 224, 256, 64, 8, 4, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 2, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 128, 256, 32, 8, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8,8,1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1> - - // clang-format on - >; - -template -using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances = - std::tuple< - // clang-format off - //###########################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //###########################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //###########################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //###########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 4>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, 
PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 128, 16, 32, 256, 8, 4, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - // Memory friendly - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 64, 16, 16, 256, 8, 4, 16, 16, 1, 1, S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 4>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 128, 16, 32, 256, 8, 4, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 128, 16, 64, 128, 8, 4, 16, 16, 1, 2, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 128, 32, 64, 128, 8, 4, 32, 32, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<8,8,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, 
CDEElementwiseOp, GemmSpec, 1, 128, 16, 128, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 128, 32, 128, 64, 8, 4, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, S<8,8,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 16, 256, 64, 8, 4, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 16>, S<4,4,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop< Row, Row, DsLayout, Row, BF16, I8, F32, F32, DsDataType, BF16, PassThrough, PassThrough, CDEElementwiseOp, GemmSpec, 1, 256, 32, 256, 64, 8, 4, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 16>, S<8,8,1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> - // clang-format on - >; - -void add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instances( - std::vector, - Row, - BF16, - I8, - ck::Tuple, - BF16, - PassThrough, - PassThrough, - Multiply>>>& instances) -{ - // comp - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances, - ck::Tuple, - Multiply, - GemmDefault>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances, - ck::Tuple, - Multiply, - GemmMNKPadding>{}); - 
add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances, - ck::Tuple, - Multiply, - GemmMNPadding>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances, - ck::Tuple, - Multiply, - GemmKPadding>{}); - // mem - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmDefault, - Intrawave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmMNKPadding, - Intrawave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmMNPadding, - Intrawave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmKPadding, - Intrawave>{}); - - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmDefault, - Interwave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmMNKPadding, - Interwave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmMNPadding, - Interwave>{}); - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances, - ck::Tuple, - Multiply, - GemmKPadding, - Interwave>{}); -} - -void add_device_grouped_gemm_xdl_tile_loop_multiply_bias_fastgelu_bf16_i8_bf16_mk_kn_mn_instances( - std::vector, - Row, - BF16, - I8, - ck::Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyAddFastGelu>>>& instances) 
-{ - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_irregular_tile_instances< - ck::Tuple, - ck::Tuple, - MultiplyAddFastGelu>{}); -} - -void add_device_grouped_gemm_xdl_tile_loop_multiply_fastgelu_bf16_i8_bf16_mk_kn_mn_instances( - std::vector, - Row, - BF16, - I8, - ck::Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyFastGelu>>>& instances) -{ - add_device_operation_instances( - instances, - device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_irregular_tile_instances< - ck::Tuple, - ck::Tuple, - MultiplyFastGelu>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp index 0b73e4fcd1..c10cd0ea9f 100644 --- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -17,7 +17,6 @@ #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/literals.hpp" #include "ck/library/utility/fill.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" @@ -42,11 +41,14 @@ bool profile_grouped_gemm_impl(int do_verification, const std::vector& StrideAs, const std::vector& StrideBs, const std::vector& StrideCs, - int kbatch = 1, - int n_warmup = 1, - int n_iter = 10) + const std::vector& kbatches = {}, + int n_warmup = 1, + int n_iter = 10) { bool pass = true; + // TODO: Fixme - we do not pass compute data type here but need it + // to compute error thresholds. 
+ using ComputeDataType = ADataType; auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { @@ -75,6 +77,7 @@ bool profile_grouped_gemm_impl(int do_verification, std::vector> c_m_n_host_results; std::vector> c_m_n_device_results; + ComputeDataType max_abs_in_val = 0.f; for(std::size_t i = 0; i < group_count; i++) { a_m_k.push_back( @@ -93,17 +96,18 @@ bool profile_grouped_gemm_impl(int do_verification, << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i << "]:" << c_m_n_device_results[i].mDesc << std::endl; } - std::size_t num_thread = 1; switch(init_method) { case 0: break; case 1: - a_m_k[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - b_k_n[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(a_m_k[i]); + ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(b_k_n[i]); + max_abs_in_val = 2.f; break; default: - a_m_k[i].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); - b_k_n[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + ck::utils::FillUniformDistribution{-0.5f, 0.5f}(a_m_k[i]); + ck::utils::FillUniformDistribution{-0.5f, 0.5f}(b_k_n[i]); + max_abs_in_val = 0.5f; } } @@ -164,7 +168,20 @@ bool profile_grouped_gemm_impl(int do_verification, BElementOp, CElementOp>; - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + // If kbatch would be bigger than 1, then we will use SplitK version. 
+ using DeviceOpSplitK = ck::tensor_operation::device::DeviceGroupedGemmSplitK, + CLayout, + ADataType, + BDataType, + ck::Tuple<>, + CDataType, + AElementOp, + BElementOp, + CElementOp>; + + auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< DeviceOp>::GetInstances(); if(op_ptrs.size() <= 0) @@ -205,7 +222,6 @@ bool profile_grouped_gemm_impl(int do_verification, ref_invoker.Run(ref_argument); } } - // profile device GEMM instances for(auto& gemm_ptr : op_ptrs) { @@ -221,43 +237,44 @@ bool profile_grouped_gemm_impl(int do_verification, auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); - DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get())); + std::size_t workspace_size = gemm_ptr->GetWorkSpaceSize(argument_ptr.get()); + std::size_t kargs_size = gemm_ptr->GetDeviceKernelArgSize(argument_ptr.get()); - gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer()); - std::string gemm_name = gemm_ptr->GetTypeString(); + DeviceMem gemm_workspace, gemm_kargs; - using DeviceOpSplitK = ck::tensor_operation::device::DeviceGroupedGemmSplitK, - CLayout, - ADataType, - BDataType, - ck::Tuple<>, - CDataType, - AElementOp, - BElementOp, - CElementOp>; - - // skip non-splitk grouped_gemm - if(dynamic_cast(gemm_ptr.get()) == nullptr) + // The following is necessary since TwoStage kernel is using additional memory both + // for Workspace and kernel arguments. 
+ if(kargs_size > 0) { - continue; + gemm_kargs.Realloc(kargs_size); + gemm_ptr->SetDeviceKernelArgs(argument_ptr.get(), gemm_kargs.GetDeviceBuffer()); } + if(workspace_size > 0 && workspace_size != kargs_size) + { + gemm_workspace.Realloc(workspace_size); + gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_workspace.GetDeviceBuffer()); + } + + std::string gemm_name = gemm_ptr->GetTypeString(); std::vector kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64}; - if(kbatch > 0) + // If the user will provide not empty kbatches list, then we test predefined set of kbatch + // values. + if(!kbatches.empty()) { - kbatch_list = {kbatch}; + kbatch_list = kbatches; } for(std::size_t j = 0; j < kbatch_list.size(); j++) { - auto kbatch_curr = kbatch_list[j]; - dynamic_cast(gemm_ptr.get()) - ->SetKBatchSize(argument_ptr.get(), kbatch_curr); + if(kbatch_curr > 1 && dynamic_cast(gemm_ptr.get()) != nullptr) + { + dynamic_cast(gemm_ptr.get()) + ->SetKBatchSize(argument_ptr.get(), kbatch_curr); + } if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) { @@ -272,23 +289,18 @@ bool profile_grouped_gemm_impl(int do_verification, bool instance_pass = true; for(std::size_t i = 0; i < gemm_descs.size(); i++) { - c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data()); + auto atol = ck::utils::get_absolute_threshold( + max_abs_in_val, gemm_descs[i].K_); + auto rtol = ck::utils::get_relative_threshold( + gemm_descs[i].K_); - if(std::is_same_v && kbatch_curr > 1) - { - instance_pass = - instance_pass && ck::utils::check_err(c_m_n_device_results[i], - c_m_n_host_results[i], - "Error: Incorrect results!", - 0.06); - } - else - { - instance_pass = - instance_pass && ck::utils::check_err(c_m_n_device_results[i], - c_m_n_host_results[i]); - } + instance_pass = + instance_pass && ck::utils::check_err(c_m_n_device_results[i], + c_m_n_host_results[i], + "Error: Incorrect results!", + rtol, + atol); if(do_log) { @@ -311,11 +323,12 @@ bool profile_grouped_gemm_impl(int 
do_verification, pass = pass && instance_pass; } - float ave_time = invoker_ptr->Run( - argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter}); - if(time_kernel) { + float ave_time = + invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter}); + std::size_t flop = 0, num_btype = 0; for(std::size_t i = 0; i < gemm_descs.size(); i++) { diff --git a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp index f665644162..94ee2a37e4 100644 --- a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp @@ -143,8 +143,7 @@ bool profile_grouped_gemm_multiply_tile_loop_impl(int do_verification, p_ds.reserve(group_count); p_e.reserve(group_count); - using KernelArguments = - ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments; + using KernelArguments = ck::tensor_operation::device::GroupedGemmKernelArgument; std::vector gemm_descs; std::vector gemm_kargs; diff --git a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp index 74faf15be3..3a4ca24dda 100644 --- a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp @@ -127,7 +127,7 @@ bool profile_grouped_gemm_tile_loop_impl(int do_verification, p_b.reserve(group_count); p_c.reserve(group_count); - using KernelArguments = ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<>; + using KernelArguments = ck::tensor_operation::device::GroupedGemmKernelArgument<>; std::vector gemm_descs; std::vector gemm_kargs; diff --git a/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp deleted file mode 100644 index 
14df96d505..0000000000 --- a/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp +++ /dev/null @@ -1,367 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/convolution_parameter.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/utility/literals.hpp" -#include "ck/library/utility/fill.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" - -namespace ck { -namespace profiler { - -template -bool profile_grouped_gemm_two_stage_impl(int do_verification, - int init_method, - bool do_log, - bool time_kernel, - const std::vector& Ms, - const std::vector& Ns, - const std::vector& Ks, - const std::vector& StrideAs, - const std::vector& StrideBs, - const std::vector& StrideCs, - int kbatch = 1, - int n_warmup = 1, - int n_iter = 10) -{ - bool pass = true; - - auto f_host_tensor_descriptor = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - using namespace ck::literals; - - if(is_same::value) - { - return HostTensorDescriptor({row, col}, {stride, 1_uz}); - } - else - { - return HostTensorDescriptor({row, col}, {1_uz, stride}); - } - }; - - std::size_t group_count = Ms.size(); - - if(!(group_count == Ns.size() && group_count == Ks.size() && group_count == StrideAs.size() 
&& - group_count == StrideBs.size() && group_count == StrideCs.size())) - { - throw std::runtime_error("wrong! inconsistent M/N/Ks, StrideA/B/Cs size\n"); - } - - std::vector> a_m_k; - std::vector> b_k_n; - std::vector> c_m_n_host_results; - std::vector> c_m_n_device_results; - - for(std::size_t i = 0; i < group_count; i++) - { - a_m_k.push_back( - Tensor(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{}))); - b_k_n.push_back( - Tensor(f_host_tensor_descriptor(Ks[i], Ns[i], StrideBs[i], BLayout{}))); - - c_m_n_device_results.push_back( - Tensor(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}))); - - c_m_n_host_results.push_back( - Tensor(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}))); - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" - << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i - << "]:" << c_m_n_device_results[i].mDesc << std::endl; - } - std::size_t num_thread = 1; - switch(init_method) - { - case 0: break; - case 1: - a_m_k[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - b_k_n[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - break; - default: - a_m_k[i].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); - b_k_n[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); - } - } - - using AElementOp = ck::tensor_operation::element_wise::PassThrough; - using BElementOp = ck::tensor_operation::element_wise::PassThrough; - using CElementOp = ck::tensor_operation::element_wise::PassThrough; - - const auto a_element_op = AElementOp{}; - const auto b_element_op = BElementOp{}; - const auto c_element_op = CElementOp{}; - - using DeviceMemPtr = std::unique_ptr; - std::vector a_device_buf, b_device_buf, c_device_buf; - - a_device_buf.reserve(group_count); - b_device_buf.reserve(group_count); - c_device_buf.reserve(group_count); - - std::vector p_a, p_b; - std::vector p_c; - - 
p_a.reserve(group_count); - p_b.reserve(group_count); - p_c.reserve(group_count); - - std::vector gemm_descs; - - gemm_descs.reserve(group_count); - - for(std::size_t i = 0; i < group_count; i++) - { - a_device_buf.emplace_back( - std::make_unique(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize())); - b_device_buf.emplace_back( - std::make_unique(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize())); - c_device_buf.emplace_back(std::make_unique( - sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize())); - - a_device_buf[i]->ToDevice(a_m_k[i].mData.data()); - b_device_buf[i]->ToDevice(b_k_n[i].mData.data()); - - gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}}); - - p_a.push_back(a_device_buf[i]->GetDeviceBuffer()); - p_b.push_back(b_device_buf[i]->GetDeviceBuffer()); - p_c.push_back(c_device_buf[i]->GetDeviceBuffer()); - } - - using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm, - CLayout, - ADataType, - BDataType, - ck::Tuple<>, - CDataType, - AElementOp, - BElementOp, - CElementOp>; - - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - if(op_ptrs.size() <= 0) - { - throw std::runtime_error("wrong! 
no device GEMM instance found"); - } - - std::string best_gemm_name; - float best_ave_time = 0; - float best_tflops = 0; - float best_gb_per_sec = 0; - float best_kbatch = 0; - - auto p_ds = std::vector>{}; - - if(do_verification) - { - for(std::size_t i = 0; i < gemm_descs.size(); i++) - { - using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; - - auto ref_gemm = ReferenceGemmInstance{}; - auto ref_invoker = ref_gemm.MakeInvoker(); - - auto ref_argument = ref_gemm.MakeArgument(a_m_k[i], - b_k_n[i], - c_m_n_host_results[i], - a_element_op, - b_element_op, - c_element_op); - - ref_invoker.Run(ref_argument); - } - } - - // profile device GEMM instances - for(auto& gemm_ptr : op_ptrs) - { - auto argument_ptr = - gemm_ptr->MakeArgumentPointer(p_a, - p_b, - p_ds, - p_c, - gemm_descs, - ck::tensor_operation::element_wise::PassThrough{}, - ck::tensor_operation::element_wise::PassThrough{}, - ck::tensor_operation::element_wise::PassThrough{}); - - auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); - - DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get())); - gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer()); - - std::string gemm_name = gemm_ptr->GetTypeString(); - - using DeviceOpSplitK = - ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitK, - CLayout, - ADataType, - BDataType, - ck::Tuple<>, - CDataType, - AElementOp, - BElementOp, - CElementOp>; - - // skip non-splitk grouped_gemm - if(dynamic_cast(gemm_ptr.get()) == nullptr) - { - continue; - } - - std::vector kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64}; - - if(kbatch > 0) - { - kbatch_list = {kbatch}; - } - - for(std::size_t j = 0; j < kbatch_list.size(); j++) - { - - auto kbatch_curr = kbatch_list[j]; - dynamic_cast(gemm_ptr.get()) - ->SetKBatchSize(argument_ptr.get(), kbatch_curr); - - DeviceMem gemm_arg_dev_mem(dynamic_cast(gemm_ptr.get()) - ->GetDeviceKernelArgSize(argument_ptr.get())); - 
dynamic_cast(gemm_ptr.get()) - ->SetDeviceKernelArgs(argument_ptr.get(), gemm_arg_dev_mem.GetDeviceBuffer()); - - if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) - { - gemm_desc_workspace.SetZero(); - for(std::size_t i = 0; i < gemm_descs.size(); i++) - c_device_buf[i]->SetZero(); - - invoker_ptr->Run(argument_ptr.get(), - StreamConfig{nullptr, false, 0, n_warmup, n_iter}); - if(do_verification) - { - bool instance_pass = true; - for(std::size_t i = 0; i < gemm_descs.size(); i++) - { - c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data()); - if(std::is_same_v && kbatch_curr > 1) - { - instance_pass = - instance_pass && ck::utils::check_err(c_m_n_device_results[i], - c_m_n_host_results[i], - "Error: Incorrect results!", - 0.06); - } - else - { - instance_pass = - instance_pass && ck::utils::check_err(c_m_n_device_results[i], - c_m_n_host_results[i]); - } - - if(do_log) - { - LogRangeAsType(std::cout << "a : ", a_m_k[i].mData, ",") - << std::endl; - LogRangeAsType(std::cout << "b: ", b_k_n[i].mData, ",") - << std::endl; - LogRangeAsType( - std::cout << "c_device: ", c_m_n_device_results[i].mData, ",") - << std::endl; - LogRangeAsType( - std::cout << "c_host : ", c_m_n_host_results[i].mData, ",") - << std::endl; - } - } - - std::cout << "Instance: " << gemm_name << " verification " - << (instance_pass ? 
"SUCCEED" : "FAILED") << std::endl; - - pass = pass && instance_pass; - } - float ave_time = invoker_ptr->Run( - argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter}); - if(time_kernel) - { - std::size_t flop = 0, num_btype = 0; - for(std::size_t i = 0; i < gemm_descs.size(); i++) - { - flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i]; - - num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + - sizeof(BDataType) * Ks[i] * Ns[i] + - sizeof(CDataType) * Ms[i] * Ns[i]; - } - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops - << " TFlops, " << gb_per_sec << " GB/s, " << gemm_name << ", KBatch " - << kbatch_curr << std::endl; - - if(tflops > best_tflops) - { - best_gemm_name = gemm_name; - best_tflops = tflops; - best_ave_time = ave_time; - best_gb_per_sec = gb_per_sec; - best_kbatch = kbatch_curr; - } - } - } - else - { - std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem" - << std::endl; - } - } - } - - if(time_kernel) - { - std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " - << best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch - << std::endl; - } - - return pass; -} - -} // namespace profiler -} // namespace ck diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index f079d554bf..35e91f8172 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -43,7 +43,6 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp) - list(APPEND PROFILER_SOURCES profile_grouped_gemm_two_stage.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp) list(APPEND 
PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp) diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp index fbf44d720f..2adcd6483a 100644 --- a/profiler/src/profile_grouped_gemm.cpp +++ b/profiler/src/profile_grouped_gemm.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -39,16 +39,13 @@ namespace { std::vector argToIntArray(char* input) { std::vector out; - std::istringstream in(input); - std::string item; while(std::getline(in, item, ',')) { out.push_back(std::stoi(item)); } - return out; } @@ -69,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[]) << "arg7: time kernel (0=n0, 1=yes)\n" << "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 " "64,64 64,64 128,128)\n" - << "arg15: kbatch value (default 1)\n" + << "arg15: kbatch values (default 1)\n" << "optional:\n" << "arg16: number of warm-up cycles (default 1)\n" << "arg17: number of iterations (default 10)\n" @@ -92,7 +89,7 @@ int profile_grouped_gemm(int argc, char* argv[]) const auto StrideAs = argToIntArray(argv[11]); const auto StrideBs = argToIntArray(argv[12]); const auto StrideCs = argToIntArray(argv[13]); - const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1; + const auto kbatches = argc >= 15 ? 
argToIntArray(argv[14]) : std::vector{}; int n_warmup = 1; int n_iter = 10; @@ -102,7 +99,6 @@ int profile_grouped_gemm(int argc, char* argv[]) n_iter = std::stoi(argv[16]); } -#ifdef CK_ENABLE_FP16 if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) { ck::profiler::profile_grouped_gemm_impl(do_verification, + init_method, + do_log, + time_kernel, + Ms, + Ns, + Ks, + StrideAs, + StrideBs, + StrideCs, + kbatches, + n_warmup, + n_iter); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN) + { + ck::profiler::profile_grouped_gemm_impl(do_verification, + init_method, + do_log, + time_kernel, + Ms, + Ns, + Ks, + StrideAs, + StrideBs, + StrideCs, + kbatches, + n_warmup, + n_iter); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN) + { + ck::profiler::profile_grouped_gemm_impl(do_verification, + init_method, + do_log, + time_kernel, + Ms, + Ns, + Ks, + StrideAs, + StrideBs, + StrideCs, + kbatches, n_warmup, n_iter); } @@ -239,7 +301,6 @@ int profile_grouped_gemm(int argc, char* argv[]) { throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); } -#endif return 0; } diff --git a/profiler/src/profile_grouped_gemm_fixed_nk.cpp b/profiler/src/profile_grouped_gemm_fixed_nk.cpp index de90a33ef4..e33d798504 100644 --- a/profiler/src/profile_grouped_gemm_fixed_nk.cpp +++ b/profiler/src/profile_grouped_gemm_fixed_nk.cpp @@ -32,9 +32,7 @@ namespace { std::vector argToIntArray(char* input) { std::vector out; - std::istringstream in(input); - std::string item; while(std::getline(in, item, ',')) @@ -83,7 +81,7 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) const auto StrideAs = argToIntArray(argv[11]); const auto StrideBs = argToIntArray(argv[12]); const auto StrideCs = argToIntArray(argv[13]); - const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1; + const int kbatch = argc >= 15 ? 
std::stoi(argv[14]) : 1; using F32 = float; using F16 = ck::half_t; @@ -97,8 +95,8 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) int n_iter = 10; if(argc == 17) { - n_warmup = std::stoi(argv[16]); - n_iter = std::stoi(argv[17]); + n_warmup = std::stoi(argv[15]); + n_iter = std::stoi(argv[16]); } #if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8) diff --git a/profiler/src/profile_grouped_gemm_two_stage.cpp b/profiler/src/profile_grouped_gemm_two_stage.cpp deleted file mode 100644 index db37a0b762..0000000000 --- a/profiler/src/profile_grouped_gemm_two_stage.cpp +++ /dev/null @@ -1,228 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include - -#include "profiler/profile_grouped_gemm_two_stage_impl.hpp" -#include "profiler_operation_registry.hpp" - -enum struct GemmMatrixLayout -{ - MK_KN_MN, // 0 - MK_NK_MN, // 1 -}; - -enum struct GemmDataType -{ - F16_F16_F16, // 0 - BF16_INT8_BF16, // 1 - BF16_BF16_BF16 // 2 -}; - -#define OP_NAME "grouped_gemm_two_stage" -#define OP_DESC "Grouped GEMM TwoStage" - -namespace { - -std::vector argToIntArray(char* input) -{ - std::vector out; - - std::istringstream in(input); - - std::string item; - - while(std::getline(in, item, ',')) - { - out.push_back(std::stoi(item)); - } - - return out; -} - -int profile_grouped_gemm_two_stage(int argc, char* argv[]) -{ - if(argc < 14) - { - std::cout - << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" - << "arg2: data type (0: fp16; 1: bf16@int8; 2: bf16)\n" - << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n]);\n" - << "arg4: verification (0: no; 1: yes)\n" - << "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n" - << "arg6: print tensor value (0: no; 1: yes)\n" - << "arg7: time kernel (0=n0, 1=yes)\n" - << "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 " - "64,64 64,64 128,128)\n" - << "arg15: kbatch 
value (default 1)\n" - << "optional:\n" - << "arg16: number of warm-up cycles (default 1)\n" - << "arg17: number of iterations (default 10)\n" - << std::endl; - - exit(1); - } - - const auto data_type = static_cast(std::stoi(argv[2])); - const auto layout = static_cast(std::stoi(argv[3])); - const bool do_verification = std::stoi(argv[4]); - const int init_method = std::stoi(argv[5]); - const bool do_log = std::stoi(argv[6]); - const bool time_kernel = std::stoi(argv[7]); - - const auto Ms = argToIntArray(argv[8]); - const auto Ns = argToIntArray(argv[9]); - const auto Ks = argToIntArray(argv[10]); - - auto StrideAs = argToIntArray(argv[11]); - auto StrideBs = argToIntArray(argv[12]); - auto StrideCs = argToIntArray(argv[13]); - const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1; - - const int DefaultStrideA = Ks[0]; - const int DefaultStrideB = Ns[0]; - const int DefaultStrideC = Ns[0]; - - for(size_t i = 0; i < Ms.size(); ++i) - { - StrideAs[i] = StrideAs[i] == -1 ? DefaultStrideA : StrideAs[i]; - StrideBs[i] = StrideBs[i] == -1 ? DefaultStrideB : StrideBs[i]; - StrideCs[i] = StrideCs[i] == -1 ? 
DefaultStrideC : StrideCs[i]; - } - - int n_warmup = 1; - int n_iter = 10; - if(argc == 17) - { - n_warmup = std::stoi(argv[16]); - n_iter = std::stoi(argv[17]); - } - - if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) - { - ck::profiler::profile_grouped_gemm_two_stage_impl( - do_verification, - init_method, - do_log, - time_kernel, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); - } - else if(data_type == GemmDataType::BF16_INT8_BF16 && layout == GemmMatrixLayout::MK_KN_MN) - { - ck::profiler::profile_grouped_gemm_two_stage_impl( - do_verification, - init_method, - do_log, - time_kernel, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); - } - else if(data_type == GemmDataType::BF16_INT8_BF16 && layout == GemmMatrixLayout::MK_NK_MN) - { - ck::profiler::profile_grouped_gemm_two_stage_impl( - do_verification, - init_method, - do_log, - time_kernel, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); - } - else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN) - { - ck::profiler::profile_grouped_gemm_two_stage_impl( - do_verification, - init_method, - do_log, - time_kernel, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); - } - else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN) - { - ck::profiler::profile_grouped_gemm_two_stage_impl( - do_verification, - init_method, - do_log, - time_kernel, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); - } - else - { - throw std::runtime_error("wrong! 
this GEMM data_type & layout is not implemented"); - } - return 0; -} - -} // anonymous namespace - -REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_gemm_two_stage); diff --git a/test/grouped_gemm/CMakeLists.txt b/test/grouped_gemm/CMakeLists.txt index 55cb209772..f47685cf91 100644 --- a/test/grouped_gemm/CMakeLists.txt +++ b/test/grouped_gemm/CMakeLists.txt @@ -6,12 +6,6 @@ if(result EQUAL 0) add_dependencies(test_grouped_gemm test_grouped_gemm_splitk) endif() -add_gtest_executable(test_grouped_gemm_two_stage_splitk test_grouped_gemm_two_stage_multiple_d_splitk_xdl.cpp) -if(result EQUAL 0) - target_link_libraries(test_grouped_gemm_two_stage_splitk PRIVATE utility device_grouped_gemm_instance) - add_dependencies(test_grouped_gemm test_grouped_gemm_two_stage_splitk) -endif() - add_gtest_executable(test_grouped_gemm_interface test_grouped_gemm_interface_xdl.cpp) if(result EQUAL 0) target_link_libraries(test_grouped_gemm_interface PRIVATE utility device_grouped_gemm_instance) diff --git a/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp b/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp index d9282fa924..74d49eb576 100644 --- a/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp +++ b/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -10,25 +10,35 @@ #include "gtest/gtest.h" #include "test_grouped_gemm_util.hpp" -using F16 = ck::half_t; +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using F8 = ck::f8_t; +using I8 = int8_t; + using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; -using RRR_F16_F16_F16 = ck::test::TestGroupedGemm>; -using RCR_F16_F16_F16 = ck::test::TestGroupedGemm>; +template +class TestGroupedGemm : public ck::test::TestGroupedGemm +{ +}; -using RRR_F16_F16_F16_LargeK = ck::test::TestGroupedGemm>; -using RCR_F16_F16_F16_LargeK = ck::test::TestGroupedGemm>; +// clang-format off +using KernelTypes = ::testing::Types< + std::tuple< Row, Row, Row, F16, F16, F16>, + std::tuple< Row, Col, Row, F16, F16, F16>, + std::tuple< Col, Row, Row, F16, F16, F16>, + std::tuple< Col, Col, Row, F16, F16, F16>, + std::tuple< Row, Row, Row, BF16, BF16, BF16>, + std::tuple< Row, Col, Row, BF16, BF16, BF16>, + std::tuple< Col, Row, Row, BF16, BF16, BF16>, + std::tuple< Row, Row, Row, BF16, I8, BF16>, + std::tuple< Row, Col, Row, BF16, I8, BF16>, + std::tuple< Row, Row, Row, F16, F8, F16>, + std::tuple< Row, Row, Row, F8, F16, F16> + >; +// clang-format on -const std::vector KBATCH{1, 2, 3, 5, 8}; - -INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_MK_KN, RRR_F16_F16_F16, testing::ValuesIn(KBATCH)); -INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_MK_NK, RCR_F16_F16_F16, testing::ValuesIn(KBATCH)); -INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_LargeK_MK_KN, - RRR_F16_F16_F16_LargeK, - testing::Values(32, 64)); -INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_LargeK_MK_NK, - RCR_F16_F16_F16_LargeK, - testing::Values(32, 64)); +TYPED_TEST_SUITE(TestGroupedGemm, KernelTypes); #include "test_grouped_gemm_ut_cases.inc" diff --git a/test/grouped_gemm/test_grouped_gemm_ut_cases.inc b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc index d94d140d97..f4011cf998 100644 --- a/test/grouped_gemm/test_grouped_gemm_ut_cases.inc +++ 
b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc @@ -1,6 +1,6 @@ #pragma once -TEST_P(RRR_F16_F16_F16, TinyCases) +TYPED_TEST(TestGroupedGemm, TinyCases) { const std::vector Ms{0, 1}; constexpr int N = 768; @@ -8,14 +8,11 @@ TEST_P(RRR_F16_F16_F16, TinyCases) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); + this->Run(Ms, Ns, Ks); } -TEST_P(RRR_F16_F16_F16, SmallCases) +TYPED_TEST(TestGroupedGemm, SmallCases) { const std::vector Ms{2, 1, 3, 4, 5, 0}; constexpr int N = 768; @@ -23,14 +20,11 @@ TEST_P(RRR_F16_F16_F16, SmallCases) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); + this->Run(Ms, Ns, Ks); } -TEST_P(RRR_F16_F16_F16, MidCases) +TYPED_TEST(TestGroupedGemm, MidCases) { const std::vector Ms{167, 183, 177, 153, 139, 204}; constexpr int N = 768; @@ -38,14 +32,11 @@ TEST_P(RRR_F16_F16_F16, MidCases) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); + this->Run(Ms, Ns, Ks); } -TEST_P(RRR_F16_F16_F16, Regular) +TYPED_TEST(TestGroupedGemm, Regular) { const std::vector Ms{64, 128, 256}; constexpr int N = 768; @@ -53,14 +44,11 @@ TEST_P(RRR_F16_F16_F16, Regular) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, 
StrideAs, StrideBs, StrideCs, this->GetParam()); + this->Run(Ms, Ns, Ks); } -TEST_P(RRR_F16_F16_F16, MNKPadded) +TYPED_TEST(TestGroupedGemm, MNKPadded) { const std::vector Ms{127, 150, 188, 210}; constexpr int N = 136; @@ -68,88 +56,11 @@ TEST_P(RRR_F16_F16_F16, MNKPadded) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); + this->Run(Ms, Ns, Ks); } -TEST_P(RCR_F16_F16_F16, TinyCases) -{ - const std::vector Ms{0, 1}; - constexpr int N = 768; - constexpr int K = 544; - - const std::vector Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} - -TEST_P(RCR_F16_F16_F16, SmallCases) -{ - const std::vector Ms{2, 1, 3, 4, 5, 0}; - constexpr int N = 768; - constexpr int K = 544; - - const std::vector Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); - - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} - -TEST_P(RCR_F16_F16_F16, MidCases) -{ - const std::vector Ms{167, 183, 177, 153, 139, 204}; - constexpr int N = 768; - constexpr int K = 544; - - const std::vector Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); - - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} - -TEST_P(RCR_F16_F16_F16, Regular) -{ - const std::vector Ms{32, 64, 128, 256}; - constexpr int N = 768; - constexpr int K = 320; - - const std::vector 
Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); - - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} - -TEST_P(RCR_F16_F16_F16, MNKPadded) -{ - const std::vector Ms{127, 150, 188, 210}; - constexpr int N = 136; - constexpr int K = 280; - - const std::vector Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); - - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} - -TEST_P(RRR_F16_F16_F16_LargeK, TestLargeKBatch) +TYPED_TEST(TestGroupedGemm, TestLargeKBatch) { const std::vector Ms{188, 210}; constexpr int N = 768; @@ -157,24 +68,8 @@ TEST_P(RRR_F16_F16_F16_LargeK, TestLargeKBatch) const std::vector Ns(Ms.size(), N); const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), N); - const std::vector StrideCs(Ms.size(), N); - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); -} - -TEST_P(RCR_F16_F16_F16_LargeK, TestLargeKBatch) -{ - const std::vector Ms{188, 210}; - constexpr int N = 768; - constexpr int K = 4096; - - const std::vector Ns(Ms.size(), N); - const std::vector Ks(Ms.size(), K); - const std::vector StrideAs(Ms.size(), K); - const std::vector StrideBs(Ms.size(), K); - const std::vector StrideCs(Ms.size(), N); - - this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam()); + this->k_batches_ = {32, 64}; + + this->Run(Ms, Ns, Ks); } diff --git a/test/grouped_gemm/test_grouped_gemm_util.hpp b/test/grouped_gemm/test_grouped_gemm_util.hpp index 9e1395b9f8..a3ab0e087c 100644 --- a/test/grouped_gemm/test_grouped_gemm_util.hpp +++ b/test/grouped_gemm/test_grouped_gemm_util.hpp @@ -22,7 +22,6 @@ #include "ck/utility/tuple.hpp" #include 
"ck/utility/number.hpp" #include "profiler/profile_grouped_gemm_impl.hpp" -#include "profiler/profile_grouped_gemm_two_stage_impl.hpp" namespace ck { namespace test { @@ -40,7 +39,7 @@ std::string serialize_range(const Range& range) } template -class TestGroupedGemm : public testing::TestWithParam +class TestGroupedGemm : public testing::Test { protected: using ALayout = std::tuple_element_t<0, Tuple>; @@ -50,23 +49,77 @@ class TestGroupedGemm : public testing::TestWithParam using BDataType = std::tuple_element_t<4, Tuple>; using EDataType = std::tuple_element_t<5, Tuple>; + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + public: static constexpr bool verify_ = true; - static constexpr int init_method_ = 1; // decimal value initialization + static constexpr int init_method_ = 1; // integer value initialization static constexpr bool log_ = false; static constexpr bool bench_ = false; // measure kernel performance + static constexpr int n_warmup_ = 0; + static constexpr int n_iter_ = 1; + std::vector k_batches_; - void SetUp() override {} + void SetUp() override { k_batches_ = {1, 2, 3, 5, 8}; } + private: + template + void SetStrides(std::vector& strides, + const std::vector& rows, + const std::vector& cols) const + { + if(std::is_same_v) + { + for(const auto c : cols) + { + strides.emplace_back(c); + } + } + else if(std::is_same_v) + { + for(const auto r : rows) + { + strides.emplace_back(r); + } + } + } + + public: void Run(const std::vector& Ms, const std::vector& Ns, const std::vector& Ks, - const std::vector& StrideAs, - const std::vector& StrideBs, - const std::vector& StrideCs, - int kbatch = 1, - int n_warmup = 1, - int n_iter = 10) + const std::vector& StrideAs = {}, + const std::vector& StrideBs = {}, + const std::vector& StrideCs = {}) + { + std::vector stride_as = StrideAs; + std::vector stride_bs = StrideBs; + std::vector stride_cs = StrideCs; + + if(stride_as.empty()) + { + SetStrides(stride_as, Ms, 
Ks); + } + if(stride_bs.empty()) + { + SetStrides(stride_bs, Ks, Ns); + } + if(stride_cs.empty()) + { + SetStrides(stride_cs, Ms, Ns); + } + + RunSingle(Ms, Ns, Ks, stride_as, stride_bs, stride_cs, k_batches_); + } + + void RunSingle(const std::vector& Ms, + const std::vector& Ns, + const std::vector& Ks, + const std::vector& StrideAs, + const std::vector& StrideBs, + const std::vector& StrideCs, + const std::vector& kbatches) { bool pass = ck::profiler::profile_grouped_gemm_impl StrideAs, StrideBs, StrideCs, - kbatch, - n_warmup, - n_iter); - EXPECT_TRUE(pass); - } -}; - -template -class TestGroupedGemmTwoStage : public testing::TestWithParam -{ - protected: - using ALayout = std::tuple_element_t<0, Tuple>; - using BLayout = std::tuple_element_t<1, Tuple>; - using ELayout = std::tuple_element_t<2, Tuple>; - using ADataType = std::tuple_element_t<3, Tuple>; - using BDataType = std::tuple_element_t<4, Tuple>; - using EDataType = std::tuple_element_t<5, Tuple>; - - public: - static constexpr bool verify_ = true; - static constexpr int init_method_ = 1; // decimal value initialization - static constexpr bool log_ = false; - static constexpr bool bench_ = false; // measure kernel performance - - void SetUp() override {} - - void Run(const std::vector& Ms, - const std::vector& Ns, - const std::vector& Ks, - const std::vector& StrideAs, - const std::vector& StrideBs, - const std::vector& StrideCs, - int kbatch = 1, - int n_warmup = 1, - int n_iter = 10) - { - bool pass = ck::profiler::profile_grouped_gemm_two_stage_impl(verify_, - init_method_, - log_, - bench_, - Ms, - Ns, - Ks, - StrideAs, - StrideBs, - StrideCs, - kbatch, - n_warmup, - n_iter); + kbatches, + n_warmup_, + n_iter_); EXPECT_TRUE(pass); } }; @@ -263,7 +264,7 @@ struct DeviceGroupedGemmSplitkInstanceWrapper p_As, p_Bs, p_Ds, p_Cs, gemm_descs, PassThrough{}, PassThrough{}, PassThrough{}); if(kbatch > 1) { - ggemm_instance.SetKBatchSize(argument, kbatch); + ggemm_instance.SetKBatchSize(&argument, kbatch); } 
return ggemm_instance.IsSupportedArgument(argument); @@ -300,13 +301,13 @@ struct DeviceGroupedGemmSplitkInstanceWrapper p_As, p_Bs, p_Ds, p_Cs, gemm_descs, PassThrough{}, PassThrough{}, PassThrough{}); if(kbatch > 1) { - ggemm_instance.SetKBatchSize(argument, kbatch); + ggemm_instance.SetKBatchSize(&argument, kbatch); } EXPECT_TRUE(ggemm_instance.IsSupportedArgument(argument)); auto invoker = ggemm_instance.MakeInvoker(); - DeviceMem gemm_desc_workspace(ggemm_instance.GetWorkSpaceSize(&argument)); - ggemm_instance.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer()); + DeviceMem dev_gemm_kargs(ggemm_instance.GetDeviceKernelArgSize(&argument)); + ggemm_instance.SetDeviceKernelArgs(&argument, dev_gemm_kargs.GetDeviceBuffer()); return invoker.Run(argument, StreamConfig{nullptr, false}); } }; From fe6b185b97e9f9875ef470884e9f9fba17be02d5 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 27 Nov 2024 06:12:56 -0800 Subject: [PATCH 22/52] move utility headers from library/include to include path (#1697) --- codegen/CMakeLists.txt | 1 + {library/include => include}/ck/library/utility/algorithm.hpp | 0 {library/include => include}/ck/library/utility/check_err.hpp | 0 {library/include => include}/ck/library/utility/conv_common.hpp | 0 .../utility/convolution_host_tensor_descriptor_helper.hpp | 0 .../ck/library/utility/convolution_parameter.hpp | 0 .../include => include}/ck/library/utility/device_memory.hpp | 0 {library/include => include}/ck/library/utility/fill.hpp | 0 .../include => include}/ck/library/utility/host_common_util.hpp | 0 {library/include => include}/ck/library/utility/host_gemm.hpp | 0 {library/include => include}/ck/library/utility/host_tensor.hpp | 0 .../ck/library/utility/host_tensor_generator.hpp | 0 {library/include => include}/ck/library/utility/iterator.hpp | 0 {library/include => include}/ck/library/utility/literals.hpp | 0 {library/include => include}/ck/library/utility/numeric.hpp | 
0 {library/include => include}/ck/library/utility/ranges.hpp | 0 16 files changed, 1 insertion(+) rename {library/include => include}/ck/library/utility/algorithm.hpp (100%) rename {library/include => include}/ck/library/utility/check_err.hpp (100%) rename {library/include => include}/ck/library/utility/conv_common.hpp (100%) rename {library/include => include}/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp (100%) rename {library/include => include}/ck/library/utility/convolution_parameter.hpp (100%) rename {library/include => include}/ck/library/utility/device_memory.hpp (100%) rename {library/include => include}/ck/library/utility/fill.hpp (100%) rename {library/include => include}/ck/library/utility/host_common_util.hpp (100%) rename {library/include => include}/ck/library/utility/host_gemm.hpp (100%) rename {library/include => include}/ck/library/utility/host_tensor.hpp (100%) rename {library/include => include}/ck/library/utility/host_tensor_generator.hpp (100%) rename {library/include => include}/ck/library/utility/iterator.hpp (100%) rename {library/include => include}/ck/library/utility/literals.hpp (100%) rename {library/include => include}/ck/library/utility/numeric.hpp (100%) rename {library/include => include}/ck/library/utility/ranges.hpp (100%) diff --git a/codegen/CMakeLists.txt b/codegen/CMakeLists.txt index 1ca0d12821..45c47672b0 100644 --- a/codegen/CMakeLists.txt +++ b/codegen/CMakeLists.txt @@ -7,6 +7,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) 
+configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h) find_package(ROCM) include(ROCMInstallTargets) diff --git a/library/include/ck/library/utility/algorithm.hpp b/include/ck/library/utility/algorithm.hpp similarity index 100% rename from library/include/ck/library/utility/algorithm.hpp rename to include/ck/library/utility/algorithm.hpp diff --git a/library/include/ck/library/utility/check_err.hpp b/include/ck/library/utility/check_err.hpp similarity index 100% rename from library/include/ck/library/utility/check_err.hpp rename to include/ck/library/utility/check_err.hpp diff --git a/library/include/ck/library/utility/conv_common.hpp b/include/ck/library/utility/conv_common.hpp similarity index 100% rename from library/include/ck/library/utility/conv_common.hpp rename to include/ck/library/utility/conv_common.hpp diff --git a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp b/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp similarity index 100% rename from library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp rename to include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp diff --git a/library/include/ck/library/utility/convolution_parameter.hpp b/include/ck/library/utility/convolution_parameter.hpp similarity index 100% rename from library/include/ck/library/utility/convolution_parameter.hpp rename to include/ck/library/utility/convolution_parameter.hpp diff --git a/library/include/ck/library/utility/device_memory.hpp b/include/ck/library/utility/device_memory.hpp similarity index 100% rename from library/include/ck/library/utility/device_memory.hpp rename to include/ck/library/utility/device_memory.hpp diff --git a/library/include/ck/library/utility/fill.hpp b/include/ck/library/utility/fill.hpp similarity index 100% rename from library/include/ck/library/utility/fill.hpp rename to include/ck/library/utility/fill.hpp diff --git 
a/library/include/ck/library/utility/host_common_util.hpp b/include/ck/library/utility/host_common_util.hpp similarity index 100% rename from library/include/ck/library/utility/host_common_util.hpp rename to include/ck/library/utility/host_common_util.hpp diff --git a/library/include/ck/library/utility/host_gemm.hpp b/include/ck/library/utility/host_gemm.hpp similarity index 100% rename from library/include/ck/library/utility/host_gemm.hpp rename to include/ck/library/utility/host_gemm.hpp diff --git a/library/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp similarity index 100% rename from library/include/ck/library/utility/host_tensor.hpp rename to include/ck/library/utility/host_tensor.hpp diff --git a/library/include/ck/library/utility/host_tensor_generator.hpp b/include/ck/library/utility/host_tensor_generator.hpp similarity index 100% rename from library/include/ck/library/utility/host_tensor_generator.hpp rename to include/ck/library/utility/host_tensor_generator.hpp diff --git a/library/include/ck/library/utility/iterator.hpp b/include/ck/library/utility/iterator.hpp similarity index 100% rename from library/include/ck/library/utility/iterator.hpp rename to include/ck/library/utility/iterator.hpp diff --git a/library/include/ck/library/utility/literals.hpp b/include/ck/library/utility/literals.hpp similarity index 100% rename from library/include/ck/library/utility/literals.hpp rename to include/ck/library/utility/literals.hpp diff --git a/library/include/ck/library/utility/numeric.hpp b/include/ck/library/utility/numeric.hpp similarity index 100% rename from library/include/ck/library/utility/numeric.hpp rename to include/ck/library/utility/numeric.hpp diff --git a/library/include/ck/library/utility/ranges.hpp b/include/ck/library/utility/ranges.hpp similarity index 100% rename from library/include/ck/library/utility/ranges.hpp rename to include/ck/library/utility/ranges.hpp From 
e7b6286441aae59d3a87db67f42369d3cc2636a4 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Wed, 27 Nov 2024 18:25:07 +0100 Subject: [PATCH 23/52] Add interwave scheduler for gemm mem pipeline (#1647) * add interwave scheduler for gemm mem pipeline * Fix merge artifacts. * Refactor unit tests. * Switch to interwave scheduler for mem example --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> Co-authored-by: Adam Osewski --- example/ck_tile/03_gemm/gemm_mem_pipeline.cpp | 3 +- example/ck_tile/03_gemm/run_gemm_example.inc | 3 +- .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 224 ++++++++++++++++++ test/ck_tile/gemm/test_gemm_mem_pipeline.cpp | 19 +- .../gemm/test_gemm_mem_pipeline_ut_cases.inc | 59 ++++- .../gemm/test_gemm_mem_pipeline_util.hpp | 25 +- 6 files changed, 311 insertions(+), 22 deletions(-) diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp index 97d150412d..cd9d9d96b6 100644 --- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp +++ b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp @@ -30,7 +30,6 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) constexpr ck_tile::index_t M_Warp_Tile = 32; constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t K_Warp_Tile = 8; - #else // Compute friendly for Intrawave scheduler constexpr ck_tile::index_t M_Tile = 256; @@ -84,7 +83,7 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) AccDataType, GemmShape, Traits, - ck_tile::GemmPipelineScheduler::Intrawave, + ck_tile::GemmPipelineScheduler::Interwave, has_hot_loop_v, tail_number_v>>; using Kernel = ck_tile::GemmKernel; diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index 5199c1e3ef..a1fc155775 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -200,7 +200,8 @@ int run_gemm_example(int argc, char* 
argv[]) return run_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); } // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not - // work. else if(a_layout == "C" && b_layout == "C") + // work. + // else if(a_layout == "C" && b_layout == "C") // { // return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{}); // } diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index 4634e9dcb9..847c5b187d 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -322,6 +322,7 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); block_sync_lds(); + LocalPrefill(a_copy_lds_window, a_block_tiles.get(number{}), a_element_func); @@ -374,6 +375,229 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem } }; + template <> + struct PipelineImpl + { + template + CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile, + SrcTileWindow& dram_tile_window) const + { + load_tile(dst_block_tile, dram_tile_window); + move_tile_window(dram_tile_window, {0, KPerBlock}); + } + + template + CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window, + const SrcBlockTile& src_block_tile, + const ElementFunction& element_func) const + { + const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile); + store_tile(lds_tile_window, block_tile_tmp); + } + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const AElementFunction& a_element_func, + const BDramBlockWindowTmp& b_dram_block_window_tmp, + const BElementFunction& b_element_func, + index_t num_loop, + void* p_smem) const + { + static_assert( + std::is_same_v> && + std::is_same_v>, + "A/B Dram block window should have the same data type as 
appropriate " + "([A|B]DataType) defined in Problem definition!"); + + static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + NPerBlock == + BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}], + "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock" + " or KPerBlock!"); + + // ------------------------------------------------------------------------------------ + // Definitions of all needed tiles + + // A tile in LDS + ADataType* p_a_lds = static_cast(p_smem); + constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor(); + auto a_lds_block = make_tensor_view(p_a_lds, a_lds_block_desc); + + // TODO: LDS alignment should come from Policy! + constexpr index_t a_lds_block_space_size_aligned = + integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), + 16) * + 16; + + // B tile in LDS + BDataType* p_b_lds = static_cast( + static_cast(static_cast(p_smem) + a_lds_block_space_size_aligned)); + constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor(); + auto b_lds_block = make_tensor_view(p_b_lds, b_lds_block_desc); + + // A DRAM tile window for load + auto a_copy_dram_window = + make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_dram_block_window_tmp.get_window_origin(), + Policy::template MakeADramTileDistribution()); + + // A LDS tile window for store + auto a_copy_lds_window = + make_tile_window(a_lds_block, + make_tuple(number{}, number{}), + {0, 0}, + a_copy_dram_window.get_tile_distribution()); + // B DRAM tile window for load + auto b_copy_dram_window = + make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_dram_block_window_tmp.get_window_origin(), + Policy::template MakeBDramTileDistribution()); + + // B LDS tile window for store + auto b_copy_lds_window = + 
make_tile_window(b_lds_block, + make_tuple(number{}, number{}), + {0, 0}, + b_copy_dram_window.get_tile_distribution()); + + // A LDS tile for block GEMM + auto a_lds_gemm_window = make_tile_window( + a_lds_block, make_tuple(number{}, number{}), {0, 0}); + // B LDS tile for block GEMM + auto b_lds_gemm_window = make_tile_window( + b_lds_block, make_tuple(number{}, number{}), {0, 0}); + + // Block GEMM + auto block_gemm = BlockGemm(); + auto c_block_tile = block_gemm.MakeCBlockTile(); + + using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution()); + using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution()); + + using ABlockTile = + decltype(make_static_distributed_tensor(ABlockTileDistr{})); + using BBlockTile = + decltype(make_static_distributed_tensor(BBlockTileDistr{})); + + tuple_array a_block_tiles; + tuple_array b_block_tiles; + + // ----------------------------------------------------------------------------------------- + // Gemm pipeline start + + // prefetch + // global read 0 + GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); + GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); + + // initialize C + tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); + + // LDS write 0 + LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); + LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); + + // Global prefetch [1, PrefetchStages] + static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) { + GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); + GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); + }); + + // main body + if constexpr(HasHotLoop) + { + index_t i = 0; + do + { + static_for<0, PrefetchStages, 1>{}([&](auto prefetch_idx) { + block_sync_lds(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + // no second block_sync_lds because it's interwave + + LocalPrefill( + a_copy_lds_window, + 
a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), + a_element_func); + LocalPrefill( + b_copy_lds_window, + b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), + b_element_func); + + GlobalPrefetch(a_block_tiles.get(number{}), + a_copy_dram_window); + GlobalPrefetch(b_block_tiles.get(number{}), + b_copy_dram_window); + }); + + i += PrefetchStages; + } while(i < (num_loop - PrefetchStages)); + } + + auto HotLoopTail = [&](auto tail_num) { + static_for<1, tail_num, 1>{}([&](auto prefetch_idx) { + block_sync_lds(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + // no second block_sync_lds because it's interwave + + LocalPrefill(a_copy_lds_window, + a_block_tiles.get(number{}), + a_element_func); + LocalPrefill(b_copy_lds_window, + b_block_tiles.get(number{}), + b_element_func); + }); + + block_sync_lds(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + }; + + if constexpr(TailNum == TailNumber::One) + { + block_sync_lds(); + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + } + else if constexpr(TailNum == TailNumber::Two) + { + HotLoopTail(number<2>{}); + } + else if constexpr(TailNum == TailNumber::Three) + { + HotLoopTail(number<3>{}); + } + else if constexpr(TailNum == TailNumber::Four) + { + HotLoopTail(number<4>{}); + } + else if constexpr(TailNum == TailNumber::Five) + { + HotLoopTail(number<5>{}); + } + else if constexpr(TailNum == TailNumber::Six) + { + HotLoopTail(number<6>{}); + } + else if constexpr(TailNum == TailNumber::Seven) + { + HotLoopTail(number<7>{}); + } + else if constexpr(TailNum == TailNumber::Full) + { + HotLoopTail(number{}); + } + + return c_block_tile; + } + }; + template +class TestCkTileGemmMemPipelineIntrawave : public TestCkTileGemmMemPipeline +{ +}; + +template +class TestCkTileGemmMemPipelineInterwave : public TestCkTileGemmMemPipeline +{ +}; // clang-format off using KernelTypes = ::testing::Types< @@ -24,6 +36,7 @@ using KernelTypes = 
::testing::Types< >; // clang-format on -TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes); +TYPED_TEST_SUITE(TestCkTileGemmMemPipelineIntrawave, KernelTypes); +TYPED_TEST_SUITE(TestCkTileGemmMemPipelineInterwave, KernelTypes); #include "test_gemm_mem_pipeline_ut_cases.inc" diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc index b26114f39d..6b914e7975 100644 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc @@ -1,6 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + #pragma once -TYPED_TEST(TestCkTileGemmMemPipeline, SmallM) +//------------------------------------------------------------------------------------------------ +// INTERWAVE SCHEDULER +//------------------------------------------------------------------------------------------------ + +TYPED_TEST(TestCkTileGemmMemPipelineInterwave, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 1024; @@ -10,7 +17,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, SmallM) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM) +TYPED_TEST(TestCkTileGemmMemPipelineInterwave, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 1024; @@ -20,7 +27,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipeline, PaddK) +TYPED_TEST(TestCkTileGemmMemPipelineInterwave, PaddK) { std::vector Ms{127}; constexpr int N = 1024; @@ -30,7 +37,51 @@ TYPED_TEST(TestCkTileGemmMemPipeline, PaddK) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipeline, Regular) +TYPED_TEST(TestCkTileGemmMemPipelineInterwave, Regular) +{ + std::vector Ms{512}; + constexpr int N = 1024; + constexpr int K = 512; + + for(int M : Ms) + this->Run(M, N, K); +} + 
+//------------------------------------------------------------------------------------------------ +// INTRAWAVE SCHEDULER +//------------------------------------------------------------------------------------------------ + +TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 1024; + constexpr int K = 320; + + for(int M : Ms) + this->Run(M, N, K); +} + +TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 1024; + constexpr int K = 320; + + for(int M : Ms) + this->Run(M, N, K); +} + +TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 1024; + constexpr int K = 432; + + for(int M : Ms) + this->Run(M, N, K); +} + +TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, Regular) { std::vector Ms{512}; constexpr int N = 1024; diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp index 6b47898339..15f9f516ee 100644 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp @@ -11,20 +11,21 @@ #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/gemm.hpp" -template +template class TestCkTileGemmMemPipeline : public ::testing::Test { protected: - using ALayout = std::tuple_element_t<0, Tuple>; - using BLayout = std::tuple_element_t<1, Tuple>; - using CLayout = std::tuple_element_t<2, Tuple>; - using ADataType = std::tuple_element_t<3, Tuple>; - using BDataType = std::tuple_element_t<4, Tuple>; - using AccDataType = std::tuple_element_t<5, Tuple>; - using CDataType = std::tuple_element_t<6, Tuple>; + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = 
std::tuple_element_t<5, Tuple>; + using CDataType = std::tuple_element_t<6, Tuple>; + static constexpr auto Scheduler = Scheduler_; // TODO: expose tile size through test t-param ? - struct gemm_basic_args + struct gemm_args { const void* p_a; const void* p_b; @@ -38,7 +39,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test ck_tile::index_t stride_C; }; - void invoke_gemm(const gemm_basic_args& args, const ck_tile::stream_config& s) + void invoke_gemm(const gemm_args& args, const ck_tile::stream_config& s) { // TODO: This should be parameterized in tests constexpr ck_tile::index_t M_Tile = 128; @@ -89,7 +90,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test AccDataType, GemmShape, Traits, - ck_tile::GemmPipelineScheduler::Intrawave, + Scheduler, has_hot_loop_v, tail_number_v>>; using Kernel = ck_tile::GemmKernel; @@ -288,7 +289,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test c_m_n_dev_buf.SetZero(); c_m_n_dev_result.SetZero(); - gemm_basic_args args; + gemm_args args; args.p_a = a_m_k_dev_buf.GetDeviceBuffer(); args.p_b = b_k_n_dev_buf.GetDeviceBuffer(); args.p_c = c_m_n_dev_buf.GetDeviceBuffer(); From f49b595dc02f3a40b61455c6914e8456b5f42f41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Thu, 28 Nov 2024 17:51:49 +0100 Subject: [PATCH 24/52] [CK TILE] Add gemm compute pipeline v3 (#1661) * [CK TILE] Add gemm compute pipeline v3 * Enable universal gemm compute pipeline. * Rename example and add compute pipeline. * Introduce ag bg cr pipeline impl base. * Refactor to reuse code. * Cleaning * Formatting. 
--------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> Co-authored-by: Adam Osewski --- example/ck_tile/03_gemm/CMakeLists.txt | 2 +- ...mm_mem_pipeline.cpp => universal_gemm.cpp} | 25 +- include/ck_tile/ops/gemm.hpp | 2 + .../block/block_universal_gemm_as_bs_cr.hpp | 223 +++++----- .../pipeline/gemm_pipeline_ag_bg_cr_base.hpp | 111 +++++ .../gemm_pipeline_ag_bg_cr_comp_v3.hpp | 383 ++++++++++++++++++ .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 270 ++++-------- 7 files changed, 714 insertions(+), 302 deletions(-) rename example/ck_tile/03_gemm/{gemm_mem_pipeline.cpp => universal_gemm.cpp} (89%) create mode 100644 include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp create mode 100644 include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index 8ae46cadc6..d166eed458 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp) -add_executable(tile_example_gemm_mem_pipeline EXCLUDE_FROM_ALL gemm_mem_pipeline.cpp) +add_executable(tile_example_universal_gemm EXCLUDE_FROM_ALL universal_gemm.cpp) diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp similarity index 89% rename from example/ck_tile/03_gemm/gemm_mem_pipeline.cpp rename to example/ck_tile/03_gemm/universal_gemm.cpp index cd9d9d96b6..eaafc13b98 100644 --- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -14,10 +14,17 @@ #include "ck_tile/host.hpp" #include "gemm_basic.hpp" +#define CK_TILE_PIPELINE_COMPUTE 1 +#define CK_TILE_PIPELINE_MEMORY 2 + +#ifndef CK_TILE_PIPELINE_DEFAULT +#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE +#endif + template float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) { -#if 1 
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) // Memory friendly for Interwave scheduler constexpr ck_tile::index_t M_Tile = 128; constexpr ck_tile::index_t N_Tile = 32; @@ -30,7 +37,8 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) constexpr ck_tile::index_t M_Warp_Tile = 32; constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t K_Warp_Tile = 8; -#else + +#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE) // Compute friendly for Intrawave scheduler constexpr ck_tile::index_t M_Tile = 256; constexpr ck_tile::index_t N_Tile = 256; @@ -63,8 +71,11 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) ck_tile::Default2DEpilogueProblem>; using Traits = ck_tile::TileGemmTraits; - +#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem< +#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE) + using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3< +#endif ck_tile::GemmPipelineProblem>; const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(args.K); @@ -77,13 +88,21 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) constexpr bool has_hot_loop_v = has_hot_loop_.value; constexpr auto tail_number_v = tail_number_.value; +#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem< +#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE) + using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3< +#endif ck_tile::UniversalGemmPipelineProblem>; using Kernel = ck_tile::GemmKernel; diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index 9a033ee2de..1340fb2048 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -25,6 +25,8 @@ #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include 
"ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp" diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp index 5f98a7a0ba..c9e648f437 100644 --- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp +++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp @@ -41,13 +41,16 @@ struct BlockUniversalGemmAsBsCr static constexpr index_t MWarp = config.template at<1>(); static constexpr index_t NWarp = config.template at<2>(); - static_assert(MWarp == BlockGemmShape::BlockWarps::at(number<0>{}), + using I0 = number<0>; + using I1 = number<1>; + + static_assert(MWarp == BlockGemmShape::BlockWarps::at(I0{}), "Error! WarpGemm's MWarp is not consisten with BlockGemmShape!"); - static_assert(NWarp == BlockGemmShape::BlockWarps::at(number<1>{}), + static_assert(NWarp == BlockGemmShape::BlockWarps::at(I1{}), "Error! WarpGemm's NWarp is not consisten with BlockGemmShape!"); - static_assert(WarpGemm::kM == BlockGemmShape::WarpTile::at(number<0>{}), + static_assert(WarpGemm::kM == BlockGemmShape::WarpTile::at(I0{}), "Error! WarpGemm's M is not consisten with BlockGemmShape!"); - static_assert(WarpGemm::kN == BlockGemmShape::WarpTile::at(number<1>{}), + static_assert(WarpGemm::kN == BlockGemmShape::WarpTile::at(I1{}), "Error! 
WarpGemm's N is not consisten with BlockGemmShape!"); static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM); @@ -99,6 +102,9 @@ struct BlockUniversalGemmAsBsCr static constexpr auto Scheduler = Traits::Scheduler; + using I0 = number<0>; + using I1 = number<1>; + private: template struct BlockGemmImpl @@ -114,35 +120,31 @@ struct BlockUniversalGemmAsBsCr const ASmemBlockWindow& a_block_window, const BSmemBlockWindow& b_block_window) { - static_assert( - std::is_same_v, - "The CDataType as defined in traits should be the same as correspoinding " - "C block tensor data type!"); - static_assert(std::is_same_v && - std::is_same_v, + static_assert(std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); + static_assert(std::is_same_v && + std::is_same_v, "The ADataType and BDataType as defined in " "traits should be the same as correspoinding block window data type!"); static_assert( - GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] && + GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}], "MPerBlock, NPerBlock, KPerBlock defined in " " BlockGemmShape are different from A/B block smem windows apropriate dims!"); - const index_t iMWarp = get_warp_id() / GemmTraits::NWarp; - const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + const index_t iMWarp = get_warp_id() / NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * NWarp); // TODO: refactor warp_window tile type to class member as it should be // compile-time known information. 
auto a_warp_window_tmp = make_tile_window( a_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - a_block_window.get_window_origin() + - multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + make_tuple(number{}, number{}), + a_block_window.get_window_origin() + multi_index<2>{iMWarp * WarpGemm::kM, 0}, + make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{})); using AWarpWindow = remove_cvref_t; @@ -156,16 +158,15 @@ struct BlockUniversalGemmAsBsCr statically_indexed_array< statically_indexed_array, - GemmTraits::MIterPerWarp> + MIterPerWarp> a_warp_windows; // construct B-warp-window auto b_warp_window_tmp = make_tile_window( b_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - b_block_window.get_window_origin() + - multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + make_tuple(number{}, number{}), + b_block_window.get_window_origin() + multi_index<2>{iNWarp * WarpGemm::kN, 0}, + make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{})); using BWarpWindow = remove_cvref_t; @@ -179,10 +180,10 @@ struct BlockUniversalGemmAsBsCr statically_indexed_array< statically_indexed_array, - GemmTraits::NIterPerWarp> + NIterPerWarp> b_warp_windows; - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { a_warp_windows(mIter)(kIter) = a_warp_window_tmp; @@ -193,7 +194,7 @@ struct BlockUniversalGemmAsBsCr }); }); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { b_warp_windows(nIter)(kIter) = b_warp_window_tmp; @@ -203,8 +204,8 @@ struct BlockUniversalGemmAsBsCr }); }); - using 
CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; - using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + using CWarpDstr = typename WarpGemm::CWarpDstr; + using CWarpTensor = typename WarpGemm::CWarpTensor; constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); @@ -212,10 +213,10 @@ struct BlockUniversalGemmAsBsCr // hot loop: static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { const auto a_warp_tile = load_tile(a_warp_windows(mIter)(kIter)); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { const auto b_warp_tile = load_tile(b_warp_windows(nIter)(kIter)); // read C warp tensor from C block tensor- @@ -226,7 +227,7 @@ struct BlockUniversalGemmAsBsCr merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); // warp GEMM - typename GemmTraits::WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile); + WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile); // write C warp tensor into C block tensor c_block_tensor.set_y_sliced_thread_data( @@ -243,13 +244,13 @@ struct BlockUniversalGemmAsBsCr struct BlockGemmImpl { statically_indexed_array< - statically_indexed_array, - GemmTraits::MIterPerWarp> + statically_indexed_array, + MIterPerWarp> a_warp_tiles_; statically_indexed_array< - statically_indexed_array, - GemmTraits::NIterPerWarp> + statically_indexed_array, + NIterPerWarp> b_warp_tiles_; template @@ -257,30 +258,27 @@ struct BlockUniversalGemmAsBsCr const BSmemBlockWindow& b_block_window) { static_assert( - GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] && 
+ GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}], "MPerBlock, NPerBlock, KPerBlock defined in " " BlockGemmShape are different from A/B block smem windows apropriate dims!"); - static_assert(std::is_same_v && - std::is_same_v, + static_assert(std::is_same_v && + std::is_same_v, "The ADataType and BDataType as defined in " "traits should be the same as correspoinding block window data type!"); - const index_t iMWarp = get_warp_id() / GemmTraits::NWarp; - const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + const index_t iMWarp = get_warp_id() / NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * NWarp); // TODO: refactor warp_window tile type to class member as it should be // compile-time known information. auto a_warp_window_tmp = make_tile_window( a_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - a_block_window.get_window_origin() + - multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + make_tuple(number{}, number{}), + a_block_window.get_window_origin() + multi_index<2>{iMWarp * WarpGemm::kM, 0}, + make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{})); using AWarpWindow = remove_cvref_t; @@ -292,18 +290,16 @@ struct BlockUniversalGemmAsBsCr AWarpWindow{}.get_window_lengths(), "AWarpWindow lengths must be equal to AWarpTile lengths!"); - statically_indexed_array< - statically_indexed_array, - GemmTraits::MIterPerWarp> + statically_indexed_array, + MIterPerWarp> a_warp_windows; // construct B-warp-window auto b_warp_window_tmp = make_tile_window( b_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - b_block_window.get_window_origin() + - multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + 
make_tuple(number{}, number{}), + b_block_window.get_window_origin() + multi_index<2>{iNWarp * WarpGemm::kN, 0}, + make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{})); using BWarpWindow = remove_cvref_t; @@ -315,13 +311,12 @@ struct BlockUniversalGemmAsBsCr BWarpWindow{}.get_window_lengths(), "BWarpWindow lengths must be equal to BWarpTile lengths!"); - statically_indexed_array< - statically_indexed_array, - GemmTraits::NIterPerWarp> + statically_indexed_array, + NIterPerWarp> b_warp_windows; - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { - static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { a_warp_windows(mIter)(kIter) = a_warp_window_tmp; // TODO: I don't have to move 0,0 window! @@ -331,8 +326,8 @@ struct BlockUniversalGemmAsBsCr }); }); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { - static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { b_warp_windows(nIter)(kIter) = b_warp_window_tmp; move_tile_window(b_warp_windows(nIter)(kIter), @@ -341,12 +336,12 @@ struct BlockUniversalGemmAsBsCr }); }); - static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { // read A warp tensor from A block window load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter)); }); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { // read B warp tensor from B Block window load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter)); }); @@ -359,22 +354,21 @@ struct BlockUniversalGemmAsBsCr [[maybe_unused]] const ASmemBlockWindow& a_block_window, 
[[maybe_unused]] const BSmemBlockWindow& b_block_window) { - static_assert( - std::is_same_v, - "The CDataType as defined in traits should be the same as correspoinding " - "C block tensor data type!"); + static_assert(std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); - using CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; - using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + using CWarpDstr = typename WarpGemm::CWarpDstr; + using CWarpTensor = typename WarpGemm::CWarpTensor; constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; // hot loop: - static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { // read C warp tensor from C block tensor- CWarpTensor c_warp_tensor; @@ -383,9 +377,9 @@ struct BlockUniversalGemmAsBsCr merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); // warp GEMM - typename GemmTraits::WarpGemm{}(c_warp_tensor, - a_warp_tiles_[mIter][kIter], - b_warp_tiles_[nIter][kIter]); + WarpGemm{}(c_warp_tensor, + a_warp_tiles_[mIter][kIter], + b_warp_tiles_[nIter][kIter]); // write C warp tensor into C block tensor c_block_tensor.set_y_sliced_thread_data( @@ -412,12 +406,12 @@ struct BlockUniversalGemmAsBsCr statically_indexed_array< statically_indexed_array, - GemmTraits::MIterPerWarp> + MIterPerWarp> a_warp_tiles_; statically_indexed_array< statically_indexed_array, - GemmTraits::NIterPerWarp> + NIterPerWarp> b_warp_tiles_; template @@ -425,30 +419,28 @@ struct BlockUniversalGemmAsBsCr const BSmemBlockWindow& b_block_window) { static_assert( - 
GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] && - GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}], + GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] && + GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] && + GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}], "MPerBlock, NPerBlock, KPerBlock defined in " " BlockGemmShape are different from A/B block smem windows apropriate dims!"); - static_assert(std::is_same_v && - std::is_same_v, + static_assert(std::is_same_v && + std::is_same_v, "The ADataType and BDataType as defined in " "traits should be the same as correspoinding block window data type!"); - const index_t iMWarp = get_warp_id() / GemmTraits::NWarp; - const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp); + const index_t iMWarp = get_warp_id() / NWarp; + const index_t iNWarp = get_warp_id() - (iMWarp * NWarp); // TODO: refactor warp_window tile type to class member as it should be // compile-time known information. 
auto a_warp_window_tmp = make_tile_window( a_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), + make_tuple(number{}, number{}), a_block_window.get_window_origin() + - multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, KIdx * KPerInnerLoop}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{})); + multi_index<2>{iMWarp * WarpGemm::kM, KIdx * KPerInnerLoop}, + make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{})); using AWarpWindow = remove_cvref_t; @@ -461,16 +453,16 @@ struct BlockUniversalGemmAsBsCr "AWarpWindow lengths must be equal to AWarpTile lengths!"); statically_indexed_array, - GemmTraits::MIterPerWarp> + MIterPerWarp> a_warp_windows; // construct B-warp-window auto b_warp_window_tmp = make_tile_window( b_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), + make_tuple(number{}, number{}), b_block_window.get_window_origin() + - multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, KIdx * KPerInnerLoop}, - make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{})); + multi_index<2>{iNWarp * WarpGemm::kN, KIdx * KPerInnerLoop}, + make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{})); using BWarpWindow = remove_cvref_t; @@ -483,10 +475,10 @@ struct BlockUniversalGemmAsBsCr "BWarpWindow lengths must be equal to BWarpTile lengths!"); statically_indexed_array, - GemmTraits::NIterPerWarp> + NIterPerWarp> b_warp_windows; - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { a_warp_windows(mIter)(kIter) = a_warp_window_tmp; @@ -496,7 +488,7 @@ struct BlockUniversalGemmAsBsCr }); }); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { b_warp_windows(nIter)(kIter) = b_warp_window_tmp; @@ -508,11 +500,11 @@ 
struct BlockUniversalGemmAsBsCr // TODO check if a_warp_tiles has same desc as a_warp_window static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) { - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { // read A warp tensor from A block window load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter)); }); - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { // read B warp tensor from B Block window load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter)); }); @@ -525,13 +517,12 @@ struct BlockUniversalGemmAsBsCr const ASmemBlockWindow& a_block_window, const BSmemBlockWindow& b_block_window) { - static_assert( - std::is_same_v, - "The CDataType as defined in traits should be the same as correspoinding " - "C block tensor data type!"); + static_assert(std::is_same_v, + "The CDataType as defined in traits should be the same as correspoinding " + "C block tensor data type!"); - using CWarpDstr = typename GemmTraits::WarpGemm::CWarpDstr; - using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor; + using CWarpDstr = typename WarpGemm::CWarpDstr; + using CWarpTensor = typename WarpGemm::CWarpTensor; constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); @@ -555,8 +546,8 @@ struct BlockUniversalGemmAsBsCr } static_for<0, KInnerLoopIter, 1>{}([&](auto kInnerIter) { - static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) { - static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { // read C warp tensor from C block tensor- CWarpTensor c_warp_tensor; @@ -573,17 +564,17 @@ struct BlockUniversalGemmAsBsCr // penalty if constexpr(kIter.value == KRepeat - 1 && kInnerIter.value == KInnerLoopIter - 1 && - mIter.value == GemmTraits::MIterPerWarp - 1 && - nIter.value == 
GemmTraits::NIterPerWarp - 1) + mIter.value == MIterPerWarp - 1 && + nIter.value == NIterPerWarp - 1) { __builtin_amdgcn_sched_barrier(0); block_sync_lds(); __builtin_amdgcn_sched_barrier(0); } // warp GEMM - typename GemmTraits::WarpGemm{}(c_warp_tensor, - a_warp_tiles_[mIter][kInnerIter], - b_warp_tiles_[nIter][kInnerIter]); + WarpGemm{}(c_warp_tensor, + a_warp_tiles_[mIter][kInnerIter], + b_warp_tiles_[nIter][kInnerIter]); // write C warp tensor into C block tensor c_block_tensor.set_y_sliced_thread_data( diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp new file mode 100644 index 0000000000..431534af15 --- /dev/null +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +template +struct GemmPipelineAgBgCrImplBase +{ + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; + + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; + + template + CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile, + SrcTileWindow& dram_tile_window) const + { + load_tile(dst_block_tile, dram_tile_window); + move_tile_window(dram_tile_window, {0, KPerBlock}); + } + + template + CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window, + const SrcBlockTile& src_block_tile, + const ElementFunction& element_func) const + { + const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile); + store_tile(lds_tile_window, block_tile_tmp); + } + + CK_TILE_DEVICE auto GetABLdsTensorViews(void* p_smem) const + { + // A tile in LDS + ADataType* p_a_lds = static_cast(p_smem); + 
constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor(); + auto a_lds_block = make_tensor_view(p_a_lds, a_lds_block_desc); + + // TODO: LDS alignment should come from Policy! + constexpr index_t a_lds_block_space_size_aligned = + integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) * + 16; + + // B tile in LDS + BDataType* p_b_lds = static_cast( + static_cast(static_cast(p_smem) + a_lds_block_space_size_aligned)); + constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor(); + auto b_lds_block = make_tensor_view(p_b_lds, b_lds_block_desc); + + return make_tuple(std::move(a_lds_block), std::move(b_lds_block)); + } + + template + CK_TILE_DEVICE auto GetAWindows(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const ALdsTensorView& a_lds_block_view) const + { + // A DRAM tile window for load + auto a_copy_dram_window = + make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_dram_block_window_tmp.get_window_origin(), + Policy::template MakeADramTileDistribution()); + + // A LDS tile window for store + auto a_copy_lds_window = + make_tile_window(a_lds_block_view, + make_tuple(number{}, number{}), + {0, 0}, + a_copy_dram_window.get_tile_distribution()); + + auto a_lds_gemm_window = make_tile_window( + a_lds_block_view, make_tuple(number{}, number{}), {0, 0}); + + return make_tuple(std::move(a_copy_dram_window), + std::move(a_copy_lds_window), + std::move(a_lds_gemm_window)); + } + + template + CK_TILE_DEVICE auto GetBWindows(const BDramBlockWindowTmp& b_dram_block_window_tmp, + const BLdsTensorView& b_lds_block_view) const + { + auto b_copy_dram_window = + make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_dram_block_window_tmp.get_window_origin(), + Policy::template MakeBDramTileDistribution()); + + // B LDS tile window for store + auto b_copy_lds_window = + 
make_tile_window(b_lds_block_view, + make_tuple(number{}, number{}), + {0, 0}, + b_copy_dram_window.get_tile_distribution()); + + auto b_lds_gemm_window = make_tile_window( + b_lds_block_view, make_tuple(number{}, number{}), {0, 0}); + + return make_tuple(std::move(b_copy_dram_window), + std::move(b_copy_lds_window), + std::move(b_lds_gemm_window)); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp new file mode 100644 index 0000000000..a72728b4a0 --- /dev/null +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" + +namespace ck_tile { + +// A Tile Window: global memory +// B Tile Window: global memory +// C Distributed tensor: register +template +struct BaseGemmPipelineAgBgCrCompV3 +{ + static constexpr index_t PrefetchStages = 2; + static constexpr index_t PrefillStages = 1; + static constexpr index_t GlobalBufferNum = 1; + + CK_TILE_HOST static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } + + CK_TILE_HOST static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop) + { + ignore = num_loop; + return TailNumber::Full; + } +}; + +// Compute optimized pipeline +// GlobalPrefetchStages: 2 +// LocalPreFillStages: 1 +// LocalPreFetchStages: 1 +// LocalSharedMemoryBuffer: 1 +template +struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3 +{ + using Base = BaseGemmPipelineAgBgCrCompV3; + using PipelineImplBase = GemmPipelineAgBgCrImplBase; + + 
using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; + + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + + using BlockGemm = remove_cvref_t())>; + using I0 = number<0>; + using I1 = number<1>; + using I2 = number<2>; + + static constexpr index_t BlockSize = Problem::kBlockSize; + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; + + static constexpr index_t VectorSizeA = Problem::VectorSizeA; + static constexpr index_t VectorSizeB = Problem::VectorSizeB; + static constexpr index_t VectorSizeC = Problem::VectorSizeC; + + static constexpr bool kPadM = Problem::kPadM; + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kPadK = Problem::kPadK; + + // Where is the right place for HasHotLoop and TailNum ??? 
+ static constexpr bool HasHotLoop = Problem::HasHotLoop; + static constexpr auto TailNum = Problem::TailNum; + static constexpr auto Scheduler = Problem::Scheduler; + + using Base::PrefetchStages; + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + struct PipelineImpl : public PipelineImplBase + { + }; + + template <> + struct PipelineImpl : public PipelineImplBase + { + using Base = PipelineImplBase; + + CK_TILE_DEVICE static constexpr auto HotLoopScheduler() + { + constexpr index_t MPerXDL = BlockGemmShape::WarpTile::at(I0{}); + constexpr index_t NPerXDL = BlockGemmShape::WarpTile::at(I1{}); + constexpr index_t KPerXDL = BlockGemmShape::WarpTile::at(I2{}); + + constexpr index_t WaveSize = 64; + constexpr index_t WaveNumM = BlockGemmShape::BlockWarps::at(I0{}); + constexpr index_t WaveNumN = BlockGemmShape::BlockWarps::at(I1{}); + + constexpr index_t A_LDS_Read_Width = KPerXDL; + constexpr index_t B_LDS_Read_Width = KPerXDL; + + constexpr index_t A_Buffer_Load_Inst_Num = + MPerBlock * KPerBlock / (BlockSize * VectorSizeA); + constexpr index_t B_Buffer_Load_Inst_Num = + NPerBlock * KPerBlock / (BlockSize * VectorSizeB); + + constexpr index_t A_LDS_Write_Inst_Num = MPerBlock * KPerBlock / (BlockSize * KPerXDL); + constexpr index_t B_LDS_Write_Inst_Num = NPerBlock * KPerBlock / (BlockSize * KPerXDL); + + constexpr index_t A_LDS_Read_Inst_Num = + WaveNumN * MPerBlock * KPerBlock / (BlockSize * KPerXDL); + constexpr index_t B_LDS_Read_Inst_Num = + WaveNumM * MPerBlock * KPerBlock / (BlockSize * KPerXDL); + + constexpr index_t C_MFMA_Inst_Num = MPerBlock * NPerBlock * KPerBlock / + (BlockSize / WaveSize) / + (MPerXDL * NPerXDL * KPerXDL); + + // A/B split schedule + // compiler is likely to use ds_read2 when instruction width smaller than 16bytes + constexpr auto num_ds_read_inst_a = A_LDS_Read_Width * sizeof(ADataType) == 16 + ? 
A_LDS_Read_Inst_Num + : A_LDS_Read_Inst_Num / 2; + constexpr auto num_ds_read_inst_b = B_LDS_Read_Width * sizeof(BDataType) == 16 + ? B_LDS_Read_Inst_Num + : B_LDS_Read_Inst_Num / 2; + + constexpr auto num_ds_write_inst_a = A_LDS_Write_Inst_Num; + constexpr auto num_ds_write_inst_b = B_LDS_Write_Inst_Num; + + constexpr auto num_buffer_load_inst_a = A_Buffer_Load_Inst_Num; + constexpr auto num_buffer_load_inst_b = B_Buffer_Load_Inst_Num; + + constexpr auto num_mfma_inst = C_MFMA_Inst_Num; + + constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32; + constexpr auto ds_read_a_issue_cycle = + A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4; + constexpr auto ds_read_b_issue_cycle = + B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4; + constexpr auto ds_read_a_mfma_rate = + (mfma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle); + constexpr auto ds_read_b_mfma_rate = + (mfma_cycle - 4 + 2 * ds_read_b_issue_cycle - 1) / (2 * ds_read_b_issue_cycle); + + constexpr auto num_dsread_a_mfma = + (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate; + constexpr auto num_dsread_b_mfma = + (num_ds_read_inst_b + ds_read_b_mfma_rate - 1) / ds_read_b_mfma_rate; + + // stage 1 + // Separate this part? + // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) > + // sizeof(ComputeDataType) / + // sizeof(BDataType) + // ? 
sizeof(ComputeDataType) / + // sizeof(ADataType) : sizeof(ComputeDataType) + // / sizeof(BDataType); + constexpr auto num_mfma_stage1 = + num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma); + constexpr auto num_mfma_per_issue = + num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b); + constexpr auto num_dswrite_per_issue_a = num_ds_write_inst_a / num_buffer_load_inst_a; + constexpr auto num_dswrite_per_issue_b = num_ds_write_inst_b / num_buffer_load_inst_b; + + static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) { + ignore = i; + static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) { + ignore = idswrite; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier( + 0x008, num_mfma_per_issue - num_dswrite_per_issue_a, 0); // MFMA + }); + static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) { + ignore = i; + static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) { + ignore = idswrite; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier( + 0x008, num_mfma_per_issue - num_dswrite_per_issue_b, 0); // MFMA + }); + + // stage 2 + static_for<0, num_dsread_a_mfma, 1>{}([&](auto i) { + if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >= + ds_read_a_mfma_rate) + { + __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read + } + else + { + __builtin_amdgcn_sched_group_barrier( + 0x100, + num_ds_read_inst_a - (num_dsread_a_mfma - 1) * ds_read_a_mfma_rate, + 0); // DS read + } + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + + static_for<0, num_dsread_b_mfma, 1>{}([&](auto i) { + if constexpr((num_ds_read_inst_b - (i + 1) 
* ds_read_b_mfma_rate) >= + ds_read_b_mfma_rate) + { + __builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_mfma_rate, 0); // DS read + } + else + { + __builtin_amdgcn_sched_group_barrier( + 0x100, + num_ds_read_inst_b - (num_dsread_b_mfma - 1) * ds_read_b_mfma_rate, + 0); // DS read + } + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + } + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const AElementFunction& a_element_func, + const BDramBlockWindowTmp& b_dram_block_window_tmp, + const BElementFunction& b_element_func, + index_t num_loop, + void* p_smem) const + { + static_assert( + std::is_same_v> && + std::is_same_v>, + "A/B Dram block window should have the same data type as appropriate " + "([A|B]DataType) defined in Problem definition!"); + + static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && + NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && + KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}], + "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock" + " or KPerBlock!"); + + // ------------------------------------------------------------------------------------ + // Definitions of all needed tiles + + // A/B tiles in LDS + auto&& [a_lds_block, b_lds_block] = Base::GetABLdsTensorViews(p_smem); + + // A DRAM tile window for load + // A LDS tile window for store + // A LDS tile for block GEMM + auto&& [a_copy_dram_window, a_copy_lds_window, a_lds_gemm_window] = + Base::GetAWindows(a_dram_block_window_tmp, a_lds_block); + + // B DRAM tile window for load + // B LDS tile window for store + // B LDS tile for block GEMM + auto&& [b_copy_dram_window, b_copy_lds_window, b_lds_gemm_window] = + Base::GetBWindows(b_dram_block_window_tmp, b_lds_block); + + // Block GEMM + auto block_gemm = BlockGemm(); + auto c_block_tile = block_gemm.MakeCBlockTile(); + + using ABlockTileDistr = 
decltype(a_copy_dram_window.get_tile_distribution()); + using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution()); + + using ABlockTile = + decltype(make_static_distributed_tensor(ABlockTileDistr{})); + using BBlockTile = + decltype(make_static_distributed_tensor(BBlockTileDistr{})); + + ABlockTile a_block_tile; + BBlockTile b_block_tile; + + // ----------------------------------------------------------------------------------------- + // Gemm pipeline start + + // prefetch + // global read 0 + Base::GlobalPrefetch(a_block_tile, a_copy_dram_window); + Base::GlobalPrefetch(b_block_tile, b_copy_dram_window); + + // initialize C + tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); + + // LDS write 0 + Base::LocalPrefill(a_copy_lds_window, a_block_tile, a_element_func); + Base::LocalPrefill(b_copy_lds_window, b_block_tile, b_element_func); + + Base::GlobalPrefetch(a_block_tile, a_copy_dram_window); + Base::GlobalPrefetch(b_block_tile, b_copy_dram_window); + + block_sync_lds(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); + + __builtin_amdgcn_sched_barrier(0); + + // main body + if constexpr(HasHotLoop) + { + index_t i = 0; + do + { + block_sync_lds(); + + Base::LocalPrefill(a_copy_lds_window, a_block_tile, a_element_func); + Base::LocalPrefill(b_copy_lds_window, b_block_tile, b_element_func); + + Base::GlobalPrefetch(a_block_tile, a_copy_dram_window); + Base::GlobalPrefetch(b_block_tile, b_copy_dram_window); + + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + + block_sync_lds(); + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); + HotLoopScheduler(); + __builtin_amdgcn_sched_barrier(0); + + i += 1; + } while(i < (num_loop - 1)); + } + // tail + if constexpr(TailNum == TailNumber::Full) + { + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + } + // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle + // latency + // 
__builtin_amdgcn_sched_barrier(0); + return c_block_tile; + } + }; + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const AElementFunction& a_element_func, + const BDramBlockWindowTmp& b_dram_block_window_tmp, + const BElementFunction& b_element_func, + index_t num_loop, + void* p_smem) const + { + return PipelineImpl{}.template operator()( + a_dram_block_window_tmp, + a_element_func, + b_dram_block_window_tmp, + b_element_func, + num_loop, + p_smem); + } + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const BDramBlockWindowTmp& b_dram_block_window_tmp, + index_t num_loop, + void* p_smem) const + { + return PipelineImpl{}.template operator()( + a_dram_block_window_tmp, + [](const ADataType& a) { return a; }, + b_dram_block_window_tmp, + [](const BDataType& b) { return b; }, + num_loop, + p_smem); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index 847c5b187d..e2e94cf92b 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -6,6 +6,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" namespace ck_tile { @@ -90,7 +91,8 @@ struct BaseGemmPipelineAgBgCrMem template struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem { - using Base = BaseGemmPipelineAgBgCrMem; + using Base = BaseGemmPipelineAgBgCrMem; + using PipelineImplBase = GemmPipelineAgBgCrImplBase; using ADataType = remove_cvref_t; using BDataType = remove_cvref_t; @@ -103,8 +105,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem using BlockGemm 
= remove_cvref_t())>; using I0 = number<0>; + using I1 = number<1>; + using I2 = number<2>; - static constexpr index_t BlockSize = Problem::kBlockSize; static constexpr index_t MPerBlock = BlockGemmShape::kM; static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t KPerBlock = BlockGemmShape::kK; @@ -124,46 +127,20 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem using Base::PrefetchStages; - CK_TILE_HOST_DEVICE constexpr index_t GetStaticLdsSize() - { - return integer_divide_ceil( - sizeof(ADataType) * - Policy::template MakeALdsBlockDescriptor().get_element_space_size(), - 16) * - 16 + - sizeof(BDataType) * - Policy::template MakeBLdsBlockDescriptor().get_element_space_size(); - } - CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Policy::template GetSmemSize(); } template - struct PipelineImpl + struct PipelineImpl : public PipelineImplBase { }; template <> - struct PipelineImpl + struct PipelineImpl : public PipelineImplBase { - template - CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile, - SrcTileWindow& dram_tile_window) const - { - load_tile(dst_block_tile, dram_tile_window); - move_tile_window(dram_tile_window, {0, KPerBlock}); - } - - template - CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window, - const SrcBlockTile& src_block_tile, - const ElementFunction& element_func) const - { - const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile); - store_tile(lds_tile_window, block_tile_tmp); - } + using Base = PipelineImplBase; template "A/B Dram block window should have the same data type as appropriate " "([A|B]DataType) defined in Problem definition!"); - static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && - NPerBlock == - BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && - KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}], + static_assert(MPerBlock == 
ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && + NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && + KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}], "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock" " or KPerBlock!"); // ------------------------------------------------------------------------------------ // Definitions of all needed tiles - // A tile in LDS - ADataType* p_a_lds = static_cast(p_smem); - constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor(); - auto a_lds_block = make_tensor_view(p_a_lds, a_lds_block_desc); - - // TODO: LDS alignment should come from Policy! - constexpr index_t a_lds_block_space_size_aligned = - integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), - 16) * - 16; - - // B tile in LDS - BDataType* p_b_lds = static_cast( - static_cast(static_cast(p_smem) + a_lds_block_space_size_aligned)); - constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor(); - auto b_lds_block = make_tensor_view(p_b_lds, b_lds_block_desc); + // A/B tiles in LDS + // With c++20 could simplify to below line. 
+ // Currently get error: captured structured bindings are a C++20 extension + // auto&& [a_lds_block, b_lds_block] = Base::GetABLdsTensorViews(p_smem); + auto ab_lds_blocks = Base::GetABLdsTensorViews(p_smem); + auto& a_lds_block = ab_lds_blocks.at(I0{}); + auto& b_lds_block = ab_lds_blocks.at(I1{}); // A DRAM tile window for load - auto a_copy_dram_window = - make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - a_dram_block_window_tmp.get_window_origin(), - Policy::template MakeADramTileDistribution()); - // A LDS tile window for store - auto a_copy_lds_window = - make_tile_window(a_lds_block, - make_tuple(number{}, number{}), - {0, 0}, - a_copy_dram_window.get_tile_distribution()); - // B DRAM tile window for load - auto b_copy_dram_window = - make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - b_dram_block_window_tmp.get_window_origin(), - Policy::template MakeBDramTileDistribution()); - - // B LDS tile window for store - auto b_copy_lds_window = - make_tile_window(b_lds_block, - make_tuple(number{}, number{}), - {0, 0}, - b_copy_dram_window.get_tile_distribution()); - // A LDS tile for block GEMM - auto a_lds_gemm_window = make_tile_window( - a_lds_block, make_tuple(number{}, number{}), {0, 0}); + auto a_windows = Base::GetAWindows(a_dram_block_window_tmp, a_lds_block); + auto& a_copy_dram_window = a_windows.at(I0{}); + auto& a_copy_lds_window = a_windows.at(I1{}); + auto& a_lds_gemm_window = a_windows.at(I2{}); + + // B DRAM tile window for load + // B LDS tile window for store // B LDS tile for block GEMM - auto b_lds_gemm_window = make_tile_window( - b_lds_block, make_tuple(number{}, number{}), {0, 0}); + auto b_windows = Base::GetBWindows(b_dram_block_window_tmp, b_lds_block); + auto& b_copy_dram_window = b_windows.at(I0{}); + auto& b_copy_lds_window = b_windows.at(I1{}); + auto& b_lds_gemm_window = b_windows.at(I2{}); // Block GEMM auto block_gemm = 
BlockGemm(); @@ -266,20 +215,20 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem // prefetch // global read 0 - GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); // initialize C tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); // LDS write 0 - LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); - LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); + Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); + Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); // Global prefetch [1, PrefetchStages] static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) { - GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); }); // main body @@ -295,19 +244,19 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_sync_lds(); - LocalPrefill( + Base::LocalPrefill( a_copy_lds_window, a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), a_element_func); - LocalPrefill( + Base::LocalPrefill( b_copy_lds_window, b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), b_element_func); - GlobalPrefetch(a_block_tiles.get(number{}), - a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(number{}), - b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(number{}), + a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(number{}), + b_copy_dram_window); }); i += PrefetchStages; @@ -323,12 +272,12 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_sync_lds(); - 
LocalPrefill(a_copy_lds_window, - a_block_tiles.get(number{}), - a_element_func); - LocalPrefill(b_copy_lds_window, - b_block_tiles.get(number{}), - b_element_func); + Base::LocalPrefill(a_copy_lds_window, + a_block_tiles.get(number{}), + a_element_func); + Base::LocalPrefill(b_copy_lds_window, + b_block_tiles.get(number{}), + b_element_func); }); block_sync_lds(); @@ -376,24 +325,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem }; template <> - struct PipelineImpl + struct PipelineImpl : public PipelineImplBase { - template - CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile, - SrcTileWindow& dram_tile_window) const - { - load_tile(dst_block_tile, dram_tile_window); - move_tile_window(dram_tile_window, {0, KPerBlock}); - } - - template - CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window, - const SrcBlockTile& src_block_tile, - const ElementFunction& element_func) const - { - const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile); - store_tile(lds_tile_window, block_tile_tmp); - } + using Base = PipelineImplBase; template "A/B Dram block window should have the same data type as appropriate " "([A|B]DataType) defined in Problem definition!"); - static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && - NPerBlock == - BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && - KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}], + static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && + NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && + KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}], "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock" " or KPerBlock!"); // ------------------------------------------------------------------------------------ // Definitions of all needed tiles - // A tile in LDS - ADataType* p_a_lds = static_cast(p_smem); - constexpr auto a_lds_block_desc = 
Policy::template MakeALdsBlockDescriptor(); - auto a_lds_block = make_tensor_view(p_a_lds, a_lds_block_desc); - - // TODO: LDS alignment should come from Policy! - constexpr index_t a_lds_block_space_size_aligned = - integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), - 16) * - 16; - - // B tile in LDS - BDataType* p_b_lds = static_cast( - static_cast(static_cast(p_smem) + a_lds_block_space_size_aligned)); - constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor(); - auto b_lds_block = make_tensor_view(p_b_lds, b_lds_block_desc); + // A/B tiles in LDS + // With c++20 could simplify to below line. + // Currently get error: captured structured bindings are a C++20 extension + // auto&& [a_lds_block, b_lds_block] = Base::GetABLdsTensorViews(p_smem); + auto ab_lds_blocks = Base::GetABLdsTensorViews(p_smem); + auto& a_lds_block = ab_lds_blocks.at(I0{}); + auto& b_lds_block = ab_lds_blocks.at(I1{}); // A DRAM tile window for load - auto a_copy_dram_window = - make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - a_dram_block_window_tmp.get_window_origin(), - Policy::template MakeADramTileDistribution()); - // A LDS tile window for store - auto a_copy_lds_window = - make_tile_window(a_lds_block, - make_tuple(number{}, number{}), - {0, 0}, - a_copy_dram_window.get_tile_distribution()); - // B DRAM tile window for load - auto b_copy_dram_window = - make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - b_dram_block_window_tmp.get_window_origin(), - Policy::template MakeBDramTileDistribution()); - - // B LDS tile window for store - auto b_copy_lds_window = - make_tile_window(b_lds_block, - make_tuple(number{}, number{}), - {0, 0}, - b_copy_dram_window.get_tile_distribution()); - // A LDS tile for block GEMM - auto a_lds_gemm_window = make_tile_window( - a_lds_block, make_tuple(number{}, number{}), {0, 0}); + auto a_windows = 
Base::GetAWindows(a_dram_block_window_tmp, a_lds_block); + auto& a_copy_dram_window = a_windows.at(I0{}); + auto& a_copy_lds_window = a_windows.at(I1{}); + auto& a_lds_gemm_window = a_windows.at(I2{}); + + // B DRAM tile window for load + // B LDS tile window for store // B LDS tile for block GEMM - auto b_lds_gemm_window = make_tile_window( - b_lds_block, make_tuple(number{}, number{}), {0, 0}); + auto b_windows = Base::GetBWindows(b_dram_block_window_tmp, b_lds_block); + auto& b_copy_dram_window = b_windows.at(I0{}); + auto& b_copy_lds_window = b_windows.at(I1{}); + auto& b_lds_gemm_window = b_windows.at(I2{}); // Block GEMM auto block_gemm = BlockGemm(); @@ -496,20 +402,20 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem // prefetch // global read 0 - GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); // initialize C tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); // LDS write 0 - LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); - LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); + Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); + Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); // Global prefetch [1, PrefetchStages] static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) { - GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); }); // main body @@ -523,19 +429,19 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); 
// no second block_sync_lds because it's interwave - LocalPrefill( + Base::LocalPrefill( a_copy_lds_window, a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), a_element_func); - LocalPrefill( + Base::LocalPrefill( b_copy_lds_window, b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), b_element_func); - GlobalPrefetch(a_block_tiles.get(number{}), - a_copy_dram_window); - GlobalPrefetch(b_block_tiles.get(number{}), - b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(number{}), + a_copy_dram_window); + Base::GlobalPrefetch(b_block_tiles.get(number{}), + b_copy_dram_window); }); i += PrefetchStages; @@ -548,12 +454,12 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); // no second block_sync_lds because it's interwave - LocalPrefill(a_copy_lds_window, - a_block_tiles.get(number{}), - a_element_func); - LocalPrefill(b_copy_lds_window, - b_block_tiles.get(number{}), - b_element_func); + Base::LocalPrefill(a_copy_lds_window, + a_block_tiles.get(number{}), + a_element_func); + Base::LocalPrefill(b_copy_lds_window, + b_block_tiles.get(number{}), + b_element_func); }); block_sync_lds(); From aa6e2087f550be335e7b14893ee615303eec3faa Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 28 Nov 2024 10:42:19 -0800 Subject: [PATCH 25/52] Reduce docker size and build time in CI. 
(#1699) * refactor docker build in CI * add Dockerfile.compiler * add input args to Dockerfile.compiler * rearrange the docker args --- Dockerfile | 4 ---- Dockerfile.compiler | 26 ++++++++++++++++++++++++++ Jenkinsfile | 45 +++++++++++++++++++++++++-------------------- 3 files changed, 51 insertions(+), 24 deletions(-) create mode 100644 Dockerfile.compiler diff --git a/Dockerfile b/Dockerfile index 38a563ce33..f9b7d76e3b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,10 +77,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- # Remove unnecessary rocm components that take a lot of space apt-get remove -y rocblas rocfft rocsparse composablekernel-dev -# hipTensor requires rocm-llvm-dev for rocm versions > 6.0.1 -RUN if [ "$ROCMVERSION" = "6.1" ]; then \ - sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev"; \ - fi # Update the cmake to version 3.27.5 RUN pip install --upgrade cmake==3.27.5 && \ #Install latest ccache diff --git a/Dockerfile.compiler b/Dockerfile.compiler new file mode 100644 index 0000000000..354b71f692 --- /dev/null +++ b/Dockerfile.compiler @@ -0,0 +1,26 @@ +ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.2" +FROM $BASE_DOCKER +ARG compiler_version="" +ARG compiler_commit="" + +# Add alternative compilers, if necessary +ENV compiler_version=$compiler_version +ENV compiler_commit=$compiler_commit +RUN sh -c "echo compiler version = '$compiler_version'" && \ + sh -c "echo compiler commit = '$compiler_commit'" + +RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \ + git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ + cd llvm-project && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" 
-DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ + make -j 16 ; \ + else echo "using the release compiler"; \ + fi + +RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \ + git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ + cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ + make -j 16 ; \ + else echo "using the release compiler"; \ + fi diff --git a/Jenkinsfile b/Jenkinsfile index b448a5130b..f8493fa2f6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -32,41 +32,42 @@ def runShell(String command){ return (output != "") } -def getDockerImageName(){ +def getBaseDockerImageName(){ def img if (params.USE_CUSTOM_DOCKER != ""){ img = "${params.USE_CUSTOM_DOCKER}" } else{ if (params.ROCMVERSION != "6.3"){ - if (params.COMPILER_VERSION == "") { - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" - } - else{ - if (params.COMPILER_COMMIT == ""){ - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" - } - else{ - def commit = "${params.COMPILER_COMMIT}"[0..6] - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}" - } - } + img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" + } + else{ + img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" + } + } + return img +} + +def getDockerImageName(){ + def img + def base_name = getBaseDockerImageName() + if (params.USE_CUSTOM_DOCKER != ""){ + img = "${params.USE_CUSTOM_DOCKER}" } else{ if (params.COMPILER_VERSION == "") { - img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" 
+ img = "${base_name}" } else{ if (params.COMPILER_COMMIT == ""){ - img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" + img = "${base_name}_${params.COMPILER_VERSION}" } else{ def commit = "${params.COMPILER_COMMIT}"[0..6] - img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}" + img = "${base_name}_${params.COMPILER_VERSION}_${commit}" } } } - } return img } @@ -131,17 +132,21 @@ def buildDocker(install_prefix){ env.DOCKER_BUILDKIT=1 checkout scm def image_name = getDockerImageName() + def base_image_name = getBaseDockerImageName() echo "Building Docker for ${image_name}" - def dockerArgs = "--squash --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' " + def dockerArgs = "--build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ - dockerArgs = dockerArgs + " --no-cache " + dockerArgs = dockerArgs + " --no-cache --build-arg BASE_DOCKER='${base_image_name}' -f Dockerfile.compiler . " + } + else{ + dockerArgs = dockerArgs + " -f Dockerfile . 
" } echo "Build Args: ${dockerArgs}" try{ if(params.BUILD_DOCKER){ //force building the new docker if that parameter is true echo "Building image: ${image_name}" - retimage = docker.build("${image_name}", dockerArgs + ' .') + retimage = docker.build("${image_name}", dockerArgs) withDockerRegistry([ credentialsId: "docker_test_cred", url: "" ]) { retimage.push() } From bb652696e765fe178404bd38a071d6d6b829bccb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 10:43:36 -0800 Subject: [PATCH 26/52] Bump rocm-docs-core from 1.9.0 to 1.9.1 in /docs/sphinx (#1701) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.9.0 to 1.9.1. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.9.0...v1.9.1) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 5bec504a08..79c74cd7f0 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.9.0 +rocm-docs-core==1.9.1 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 8881c0e746..426073037f 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.9.0 +rocm-docs-core==1.9.1 # via -r requirements.in six==1.16.0 # via pybtex From 78f0fea08eafa7e3da49cbb3d77c962cecb3ae0b Mon Sep 17 00:00:00 2001 From: aledudek Date: Fri, 29 Nov 2024 11:52:18 +0100 Subject: [PATCH 27/52] Ck tile batched gemm example (#1615) * [CK Tile] Batched GEMM Example * [CK Tile] Batched GEMM Example - minor refactor * [CK Tile] Batched GEMM Example - README update * [CK Tile] Batched Gemm Example - review changes - Added tensor data layours as input parameters - Changed structure of Host and Kernel args - Removed bug with invalid vector read on non-contiguous memory * [CK Tile] Batched Gemm Example - remove comment * [CK Tile] Batched Gemm Example - Add GTests part1 * [CK Tile] Batched Gemm Example - GTests part2 + review changes * [CK TILE] Batched GEMM post merge fixes * [CK Tile] Batched GEMM Example - fix pad views --- .../ck_tile/16_batched_gemm/CMakeLists.txt | 1 + example/ck_tile/16_batched_gemm/README.md | 37 +++ .../ck_tile/16_batched_gemm/batched_gemm.cpp | 103 +++++++ .../ck_tile/16_batched_gemm/batched_gemm.hpp | 63 +++++ .../run_batched_gemm_example.inc | 253 +++++++++++++++++ example/ck_tile/CMakeLists.txt | 2 +- .../ck_tile/host/reference/reference_gemm.hpp | 112 ++++++++ 
include/ck_tile/ops/gemm.hpp | 1 + .../ops/gemm/kernel/batched_gemm_kernel.hpp | 258 ++++++++++++++++++ .../gemm_pipeline_agmem_bgmem_creg_v1.hpp | 2 +- test/ck_tile/CMakeLists.txt | 1 + test/ck_tile/batched_gemm/CMakeLists.txt | 4 + .../batched_gemm/test_batched_gemm.cpp | 29 ++ .../test_batched_gemm_ut_cases.inc | 9 + .../batched_gemm/test_batched_gemm_util.hpp | 225 +++++++++++++++ 15 files changed, 1098 insertions(+), 2 deletions(-) create mode 100644 example/ck_tile/16_batched_gemm/CMakeLists.txt create mode 100644 example/ck_tile/16_batched_gemm/README.md create mode 100644 example/ck_tile/16_batched_gemm/batched_gemm.cpp create mode 100644 example/ck_tile/16_batched_gemm/batched_gemm.hpp create mode 100644 example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc create mode 100644 include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp create mode 100644 test/ck_tile/batched_gemm/CMakeLists.txt create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm.cpp create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm_util.hpp diff --git a/example/ck_tile/16_batched_gemm/CMakeLists.txt b/example/ck_tile/16_batched_gemm/CMakeLists.txt new file mode 100644 index 0000000000..78e78c6b04 --- /dev/null +++ b/example/ck_tile/16_batched_gemm/CMakeLists.txt @@ -0,0 +1 @@ +add_executable(tile_example_batched_gemm EXCLUDE_FROM_ALL batched_gemm.cpp) diff --git a/example/ck_tile/16_batched_gemm/README.md b/example/ck_tile/16_batched_gemm/README.md new file mode 100644 index 0000000000..34b56db526 --- /dev/null +++ b/example/ck_tile/16_batched_gemm/README.md @@ -0,0 +1,37 @@ +# Batched GEMM + +This folder contains example for batched GEMM using ck_tile tile-programming implementation. 
+ +## build +``` +# in the root of ck_tile +mkdir build && cd build +# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ <arch> +make tile_example_batched_gemm -j +``` +This will result in an executable `build/bin/tile_example_batched_gemm` + +## example +``` +args: + -m m dimension (default:256) + -n n dimension (default:128) + -k k dimension (default:128) + -a_layout A tensor data layout (default:R) (R for Row, C for Col) + -b_layout B tensor data layout (default:R) (R for Row, C for Col) + -c_layout C tensor data layout (default:R) (R for Row, C for Col) + -stride_a Tensor A stride (default:128) + -stride_b Tensor B stride (default:128) + -stride_c Tensor C stride (default:128) + -batch_stride_a Batch A stride (default:32768) + -batch_stride_b Batch B stride (default:16384) + -batch_stride_c Batch C stride (default:32768) + -batch_count Batch count (default:16) + -v 0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2) + -e Absolute error tolerance (default:1e-5) + -prec data type. fp16/bf16/fp8/bf8 (default:fp16) + -warmup number of iterations before benchmark the kernel (default:10) + -repeat number of iterations to benchmark the kernel (default:100) + -timer gpu:gpu timer, cpu:cpu timer (default:gpu) +``` \ No newline at end of file diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp new file mode 100644 index 0000000000..bfdd74126e --- /dev/null +++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/host.hpp" +#include "batched_gemm.hpp" + +template +float batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s) +{ + // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; + constexpr bool kTilePermute = false; + // The rank and permutation will also be generate out by the CodeGen part. + constexpr ck_tile::index_t kOutputRank = 2; + + constexpr int kBlockPerCu = 1; + + // This part comes from the Codegen + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; + + // Whether doing the CShuffle (transpose before the global memory), depending on the output + // layout. + constexpr bool CShuffleEpilogue = + std::is_same_v; + + using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::GemmTilePartitioner; + + using GemmEpilogue = std::conditional_t< + CShuffleEpilogue, + ck_tile::CShuffleEpilogue>, + ck_tile::Default2DEpilogue< + ck_tile::Default2DEpilogueProblem>>; + + using CodegenGemmTraits = + ck_tile::TileGemmTraits; + + using CodegenPipelineProblem = ck_tile:: + GemmPipelineProblem; + + using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + // ToDo: Will add the codegen part to test different pipeline policies in GEMM. + // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. 
+ using Kernel = ck_tile::BatchedGemmKernel; + + auto kargs = Kernel::MakeKargs(args); + + const dim3 grids = Kernel::GridSize(args); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; +} + +#include "run_batched_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); } diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.hpp b/example/ck_tile/16_batched_gemm/batched_gemm.hpp new file mode 100644 index 0000000000..e252c0f673 --- /dev/null +++ b/example/ck_tile/16_batched_gemm/batched_gemm.hpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" + +template +struct BatchedGemmTypeConfig; + +template <> +struct BatchedGemmTypeConfig +{ + using ADataType = ck_tile::half_t; + using BDataType = ck_tile::half_t; + using AccDataType = float; + using CDataType = ck_tile::half_t; +}; + +using Types = BatchedGemmTypeConfig; + +// Specific type aliases for easy access +using ADataType = Types::ADataType; +using BDataType = Types::BDataType; +using AccDataType = Types::AccDataType; +using CDataType = Types::CDataType; + +struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs +{ +}; + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "256", "m dimension") + .insert("n", "128", "n dimension") + .insert("k", "128", "k dimension") + .insert("stride_a", "0", "Tensor A stride") + .insert("stride_b", "0", "Tensor B stride") + .insert("stride_c", "0", "Tensor C stride") + .insert("a_layout", "R", "A tensor data layout - Row by default") + .insert("b_layout", "R", "B tensor data layout - Row by default") + .insert("c_layout", "R", "C tensor data layout - Row by default") + .insert("batch_stride_a", "32768", "Batch A stride") + .insert("batch_stride_b", "16384", "Batch B stride") + .insert("batch_stride_c", "32768", "Batch C stride") + .insert("batch_count", "16", "Batch count") + .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU") + .insert("prec", "fp16", "data type. 
fp16/bf16/fp8/bf8") + .insert("warmup", "50", "number of iterations before benchmark the kernel") + .insert("repeat", "100", "number of iterations to benchmark the kernel") + .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +// host API +float batched_gemm(batched_gemm_kargs args, const ck_tile::stream_config& s); diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc new file mode 100644 index 0000000000..dacca2042e --- /dev/null +++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +template +float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C, + ck_tile::index_t batch_stride_A, + ck_tile::index_t batch_stride_B, + ck_tile::index_t batch_stride_C, + ck_tile::index_t batch_count, + int n_warmup, + int n_repeat) +{ + batched_gemm_kargs args; + args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); + args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); + args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); + args.M = M; + args.N = N; + args.K = K; + args.stride_A = stride_A; + args.stride_B = stride_B; + args.stride_C = stride_C; + args.batch_stride_A = batch_stride_A; + args.batch_stride_B = batch_stride_B; + args.batch_stride_C = batch_stride_C; + args.batch_count = batch_count; + + float ave_time = batched_gemm( + args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); + + std::string op_name{"Batched Gemm"}; + std::size_t flop = std::size_t(2) * batch_count * M * N * K; + std::size_t num_byte = 
sizeof(ADataType) * batch_count * M * K + + sizeof(BDataType) * batch_count * N * K + + sizeof(CDataType) * batch_count * M * N; + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Run " << op_name << "kernel with M =" << M << " N =" << N << " K =" << K + << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C + << " batch_stride_A =" << batch_stride_A << " batch_stride_B =" << batch_stride_B + << " batch_stride_C =" << batch_stride_C << " batch_count =" << batch_count << " : " + << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << std::endl; + + return ave_time; +} + +template +int run_batched_gemm_example_with_layouts(int argc, + char* argv[], + const ALayout a_layout = ALayout{}, + const BLayout b_layout = BLayout{}, + [[maybe_unused]] const CLayout c_layout = CLayout{}) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + ck_tile::index_t M = arg_parser.get_int("m"); + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t K = arg_parser.get_int("k"); + + ck_tile::index_t stride_A = arg_parser.get_int("stride_a"); + ck_tile::index_t stride_B = arg_parser.get_int("stride_b"); + ck_tile::index_t stride_C = arg_parser.get_int("stride_c"); + + ck_tile::index_t batch_stride_A = arg_parser.get_int("batch_stride_a"); + ck_tile::index_t batch_stride_B = arg_parser.get_int("batch_stride_b"); + ck_tile::index_t batch_stride_C = arg_parser.get_int("batch_stride_c"); + ck_tile::index_t batch_count = arg_parser.get_int("batch_count"); + + int n_warmup = arg_parser.get_int("warmup"); + int n_repeat = arg_parser.get_int("repeat"); + + using namespace ck_tile::literals; + + auto f_host_tensor_descriptor = [](std::size_t batch_count_, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + if constexpr(std::is_same_v) + { + return 
ck_tile::HostTensorDescriptor({batch_count_, row, col}, + {batch_stride, stride, 1_uz}); + } + else + { + return ck_tile::HostTensorDescriptor({batch_count_, row, col}, + {batch_stride, 1_uz, stride}); + } + }; + + auto f_get_default_stride = [](std::size_t row, + std::size_t col, + std::size_t stride, + auto layout) { + if(stride == 0) + { + // give a chance if stride is zero, return a default packed stride + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; + }; + + stride_A = f_get_default_stride(M, K, stride_A, a_layout); + stride_B = f_get_default_stride(K, N, stride_B, b_layout); + stride_C = f_get_default_stride(M, N, stride_C, c_layout); + + ck_tile::HostTensor a_m_k( + f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, a_layout)); + ck_tile::HostTensor b_k_n( + f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, b_layout)); + ck_tile::HostTensor c_m_n_dev_result( + f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, c_layout)); + + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); + ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + b_k_n_dev_buf.ToDevice(b_k_n.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + invoke_batched_gemm(a_m_k_dev_buf, + b_k_n_dev_buf, + c_m_n_dev_buf, + M, + N, + K, + stride_A, + stride_B, + stride_C, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_count, + n_warmup, + n_repeat); + + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + bool pass = true; + + if(arg_parser.get_int("v") == 1) + { + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(batch_count, M, N, stride_C, 
batch_stride_C, CLayout{})); + c_m_n_host_ref.SetZero(); + + const auto b_n_k = b_k_n.transpose({0, 2, 1}); + + ck_tile::reference_batched_gemm( + a_m_k, b_n_k, c_m_n_host_ref); + + pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref); + + std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl; + } + else if(arg_parser.get_int("v") == 2) + { + ck_tile::HostTensor c_m_n_gpu_ref( + f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, CLayout{})); + ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes()); + c_m_n_gpu_ref.SetZero(); + c_m_n_gpu_buf_ref.SetZero(); + + ck_tile::reference_batched_gemm_gpu(a_m_k_dev_buf, + b_k_n_dev_buf, + c_m_n_gpu_buf_ref, + M, + N, + K, + stride_A, + stride_B, + stride_C, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_count); + + c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data()); + pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_gpu_ref); + + std::cout << "The GPU verification result is: " << (pass ? 
"correct" : "fail") << std::endl; + } + + return pass; +} + +int run_batched_gemm_example(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + using Row = ck_tile::tensor_layout::gemm::RowMajor; + using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + + std::string a_layout = arg_parser.get_str("a_layout"); + std::string b_layout = arg_parser.get_str("b_layout"); + + if(a_layout == "R" && b_layout == "R") + { + return run_batched_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{}); + } + else if(a_layout == "R" && b_layout == "C") + { + return run_batched_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not + // work else if(a_layout == "C" && b_layout == "C") + // { + // return run_batched_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{}); + // } + // else if(a_layout == "C" && b_layout == "R") + // { + // return run_batched_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{}); + // } + else + { + throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); + } +} diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 29305405bc..51ebb5bf07 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -15,4 +15,4 @@ add_subdirectory(12_smoothquant) add_subdirectory(13_moe_sorting) add_subdirectory(14_moe_smoothquant) add_subdirectory(15_fused_moe) - +add_subdirectory(16_batched_gemm) diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp index dbdef0e9c7..8bd1f5b048 100644 --- a/include/ck_tile/host/reference/reference_gemm.hpp +++ b/include/ck_tile/host/reference/reference_gemm.hpp @@ -183,4 +183,116 @@ void reference_gemm_gpu(DeviceMem& a_device, return; } + +template +void reference_batched_gemm_gpu(DeviceMem& a_device, + DeviceMem& 
b_device, + DeviceMem& c_device, + index_t M, + index_t N, + index_t K, + index_t stride_a, + index_t stride_b, + index_t stride_c, + index_t batch_stride_A, + index_t batch_stride_B, + index_t batch_stride_C, + index_t batch_count) +{ + + ADataType* d_A; + BDataType* d_B; + CDataType* d_C; + + hipError_t errA = hipMalloc(&d_A, batch_count * M * K * sizeof(ADataType)); + hipError_t errB = hipMalloc(&d_B, batch_count * N * K * sizeof(BDataType)); + hipError_t errC = hipMalloc(&d_C, batch_count * M * N * sizeof(CDataType)); + if(errA != hipSuccess) + { + std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA) + << std::endl; + return; // Early exit on error + } + + if(errB != hipSuccess) + { + std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB) + << std::endl; + return; // Early exit on error + } + + if(errC != hipSuccess) + { + std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC) + << std::endl; + return; // Early exit on error + } + + errA = hipMemcpy(d_A, + a_device.GetDeviceBuffer(), + batch_count * M * K * sizeof(ADataType), + hipMemcpyHostToDevice); + if(errA != hipSuccess) + { + std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl; + } + + errB = hipMemcpy(d_B, + b_device.GetDeviceBuffer(), + batch_count * N * K * sizeof(BDataType), + hipMemcpyHostToDevice); + if(errB != hipSuccess) + { + std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl; + } + + int totalElements = M * N; + int numThreadsPerBlock = 256; // Common choice for threads per block + int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock; + + for(index_t batch_id = 0; batch_id < batch_count; ++batch_id) + { + ADataType* d_ATemp = d_A + batch_id * batch_stride_A; + BDataType* d_BTemp = d_B + batch_id * batch_stride_B; + CDataType* d_CTemp = d_C + batch_id * batch_stride_C; + naive_gemm_kernel + <<>>( + d_ATemp, d_BTemp, d_CTemp, M, N, 
K, stride_a, stride_b, stride_c); + } + + errC = hipMemcpy(c_device.GetDeviceBuffer(), + d_C, + batch_count * M * N * sizeof(CDataType), + hipMemcpyDeviceToHost); + if(errC != hipSuccess) + { + std::cerr << "Error copying C to device: " << hipGetErrorString(errC) << std::endl; + } + + errA = hipFree(d_A); + if(errA != hipSuccess) + { + std::cerr << "Error free the A memory: " << hipGetErrorString(errA) << std::endl; + } + + errB = hipFree(d_B); + if(errB != hipSuccess) + { + std::cerr << "Error free the B memory: " << hipGetErrorString(errB) << std::endl; + } + + errC = hipFree(d_C); + if(errC != hipSuccess) + { + std::cerr << "Error free the C memory: " << hipGetErrorString(errC) << std::endl; + } + + return; +} } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index 1340fb2048..b9eb248581 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -25,6 +25,7 @@ #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" +#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp" diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp new file mode 100644 index 0000000000..07b4af5730 --- /dev/null +++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" + +namespace ck_tile { + +struct BatchedGemmHostArgs +{ + const void* a_ptr; + const void* b_ptr; + void* c_ptr; + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; + index_t batch_stride_A; + index_t batch_stride_B; + index_t batch_stride_C; + index_t batch_count; +}; + +template +struct BatchedGemmKernel +{ + using TilePartitioner = remove_cvref_t; + using GemmPipeline = remove_cvref_t; + using EpiloguePipeline = remove_cvref_t; + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize; + + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + + struct BatchedGemmKargs + { + const void* a_ptr; + const void* b_ptr; + void* c_ptr; + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; + index_t batch_stride_A; + index_t batch_stride_B; + index_t batch_stride_C; + index_t batch_count; + }; + + using Kargs = BatchedGemmKargs; + using Hargs = BatchedGemmHostArgs; + + __host__ static constexpr auto GridSize(const Hargs& h) + { + return TilePartitioner::GridSize(h.M, h.N, h.batch_count); + } + + __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); } + + CK_TILE_HOST static constexpr BatchedGemmKargs MakeKargs(const Hargs& h) + { + Kargs k; + k.a_ptr = h.a_ptr; + k.b_ptr = h.b_ptr; + k.c_ptr = h.c_ptr; + k.M = h.M; + k.N = h.N; + k.K = h.K; + k.stride_A = h.stride_A; + k.stride_B = h.stride_B; + k.stride_C = h.stride_C; + k.batch_stride_A = h.batch_stride_A; + k.batch_stride_B = h.batch_stride_B; + k.batch_stride_C = h.batch_stride_C; + k.batch_count = h.batch_count; + return k; + } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return max(GemmPipeline::GetSmemSize(), 
EpiloguePipeline::GetSmemSize()); + } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + const auto [i_m, i_n] = TilePartitioner{}(); + const auto i_batch = __builtin_amdgcn_readfirstlane(blockIdx.z); + + // options + const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A); + const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A); + const ADataType* a_start = static_cast(kargs.a_ptr); + + const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B); + const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B); + const BDataType* b_start = static_cast(kargs.b_ptr); + + // Convert pointers to tensor views + auto a_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + a_start + batch_offset_A, + make_tuple(kargs.M, kargs.K), + make_tuple(kargs.stride_A, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + a_start + batch_offset_A, + make_tuple(kargs.M, kargs.K), + make_tuple(1, kargs.stride_A), + number<1>{}, + number<1>{}); + } + }(); + + auto b_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + b_start + batch_offset_B, + make_tuple(kargs.N, kargs.K), + make_tuple(1, kargs.stride_B), + number<1>{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + b_start + batch_offset_B, + make_tuple(kargs.N, kargs.K), + make_tuple(kargs.stride_B, 1), + number{}, + number<1>{}); + } + }(); + + auto a_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view( + a_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + a_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + // clang-format on + + auto a_block_window = make_tile_window( + a_pad_view, + make_tuple(number{}, number{}), + {i_m, 0}); + + auto b_pad_view = [&]() { + if constexpr(std::is_same_v) + { + 
return pad_tensor_view( + b_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + b_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + // clang-format on + + auto b_block_window = make_tile_window( + b_pad_view, + make_tuple(number{}, number{}), + {i_n, 0}); + + // allocate LDS + __shared__ char smem_ptr[GetSmemSize()]; + + const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K); + + // Run GEMM cooperatively by whole wokrgroup. + auto c_block_tile = + GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr); + + const auto batch_stride_C = __builtin_amdgcn_readfirstlane(kargs.batch_stride_C); + const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_C); + CDataType* c_start = static_cast(kargs.c_ptr); + auto c_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + c_start + batch_offset_C, + make_tuple(kargs.M, kargs.N), + make_tuple(kargs.stride_C, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + c_start + batch_offset_C, + make_tuple(kargs.M, kargs.N), + make_tuple(1, kargs.stride_C), + number<1>{}, + number<1>{}); + } + }(); + + auto c_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view( + c_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + c_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + auto c_block_window = make_tile_window( + c_pad_view, + make_tuple(number{}, number{}), + {i_m, i_n}); + + EpiloguePipeline{}(c_block_window, c_block_tile); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp index c0817e736b..822748c69b 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +++ 
b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp @@ -124,7 +124,7 @@ struct GemmPipelineAGmemBGmemCRegV1 b_lds_block, make_tuple(number{}, number{}), {0, 0}); // Block GEMM - constexpr auto block_gemm = Policy::template GetBlockGemm(); + auto block_gemm = Policy::template GetBlockGemm(); // Acc register tile auto c_block_tile = decltype(block_gemm(a_lds_gemm_window, b_lds_gemm_window)){}; diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index ac9c4311df..fd0de0f9c1 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(image_to_column) add_subdirectory(gemm) +add_subdirectory(batched_gemm) diff --git a/test/ck_tile/batched_gemm/CMakeLists.txt b/test/ck_tile/batched_gemm/CMakeLists.txt new file mode 100644 index 0000000000..532ead1124 --- /dev/null +++ b/test/ck_tile/batched_gemm/CMakeLists.txt @@ -0,0 +1,4 @@ +# Currently ck_tile is only built on gfx9 +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_ck_tile_batched_gemm test_batched_gemm.cpp) +endif() diff --git a/test/ck_tile/batched_gemm/test_batched_gemm.cpp b/test/ck_tile/batched_gemm/test_batched_gemm.cpp new file mode 100644 index 0000000000..29bed8d2fd --- /dev/null +++ b/test/ck_tile/batched_gemm/test_batched_gemm.cpp @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "gtest/gtest.h" + +#include "ck_tile/host.hpp" +#include "test_batched_gemm_util.hpp" + +using F16 = ck_tile::half_t; +using F32 = float; + +using Row = ck_tile::tensor_layout::gemm::RowMajor; +using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + +// clang-format off +using KernelTypes = ::testing::Types< + // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType + std::tuple< Row, Row, Row, F16, F16, F32, F16>, + //std::tuple< Col, Row, Row, F16, F16, F32, F16>, + std::tuple< Row, Col, Row, F16, F16, F32, F16>//, + //std::tuple< Col, Col, Row, F16, F16, F32, F16> + >; +// clang-format on + +TYPED_TEST_SUITE(TestCkTileBatchedGemm, KernelTypes); + +#include "test_batched_gemm_ut_cases.inc" diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc new file mode 100644 index 0000000000..f261164d61 --- /dev/null +++ b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc @@ -0,0 +1,9 @@ +#pragma once + +TYPED_TEST(TestCkTileBatchedGemm, Basic) +{ + constexpr int M = 256; + constexpr int N = 128; + constexpr int K = 128; + this->Run(M, N, K); +} diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp new file mode 100644 index 0000000000..88145b987b --- /dev/null +++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" + +template +class TestCkTileBatchedGemm : public ::testing::Test +{ + protected: + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = std::tuple_element_t<5, Tuple>; + using CDataType = std::tuple_element_t<6, Tuple>; + + struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs + { + }; + + template + void invoke_batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s) + { + // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; + constexpr bool kTilePermute = false; + // The rank and permutation will also be generate out by the CodeGen part. + constexpr ck_tile::index_t kOutputRank = 2; + + constexpr int kBlockPerCu = 1; + + // This part comes from the Codegen + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; + + // Whether doing the CShuffle (transpose before the global memory), depending on the output + // layout. 
+ constexpr bool CShuffleEpilogue = + std::is_same_v; + + using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::GemmTilePartitioner; + + using GemmEpilogue = std::conditional_t< + CShuffleEpilogue, + ck_tile::CShuffleEpilogue>, + ck_tile::Default2DEpilogue< + ck_tile::Default2DEpilogueProblem>>; + + using CodegenGemmTraits = + ck_tile::TileGemmTraits; + + using CodegenPipelineProblem = ck_tile::GemmPipelineProblem; + + using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + using Kernel = + ck_tile::BatchedGemmKernel; + + auto kargs = Kernel::MakeKargs(args); + + const dim3 grids = Kernel::GridSize(args); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + } + + public: + void Run(const int M, + const int N, + const int K, + int StrideA = 128, + int StrideB = 128, + int StrideC = 128, + const int BatchStrideA = 32768, + const int BatchStrideB = 16384, + const int BatchStrideC = 32768, + const int BatchCount = 16) + { + using namespace ck_tile::literals; + + auto f_host_tensor_descriptor = [](std::size_t batch_count_, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + if constexpr(std::is_same_v) + { + return ck_tile::HostTensorDescriptor({batch_count_, row, col}, + {batch_stride, stride, 1_uz}); + } + else + { + return ck_tile::HostTensorDescriptor({batch_count_, row, col}, + {batch_stride, 1_uz, stride}); + } + }; + + auto f_get_default_stride = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(stride == 0) + { + // give a chance if stride is zero, return a default 
packed stride + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; + }; + + StrideA = f_get_default_stride(M, K, StrideA, ALayout{}); + StrideB = f_get_default_stride(K, N, StrideB, BLayout{}); + StrideC = f_get_default_stride(M, N, StrideC, CLayout{}); + + ck_tile::HostTensor a_m_k( + f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{})); + ck_tile::HostTensor b_k_n( + f_host_tensor_descriptor(BatchCount, K, N, StrideB, BatchStrideB, BLayout{})); + ck_tile::HostTensor c_m_n_dev_result( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{})); + + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); + ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + b_k_n_dev_buf.ToDevice(b_k_n.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + batched_gemm_kargs kargs{a_m_k_dev_buf.GetDeviceBuffer(), + b_k_n_dev_buf.GetDeviceBuffer(), + c_m_n_dev_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + StrideC, + BatchStrideA, + BatchStrideB, + BatchStrideC, + BatchCount}; + + invoke_batched_gemm(kargs, + ck_tile::stream_config{nullptr, false}); + + std::cout << "Run kernel with M =" << M << " N =" << N << " K =" << K + << " StrideA =" << StrideA << " StrideB =" << StrideB << " StrideC =" << StrideC + << " BatchStrideA =" << BatchStrideA << " BatchStrideB =" << BatchStrideB + << " BatchStrideC =" << BatchStrideC << " BatchCount =" << BatchCount + << std::endl; + + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + bool pass = true; + + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{})); + 
c_m_n_host_ref.SetZero(); + + const auto b_n_k = b_k_n.transpose({0, 2, 1}); + ck_tile::reference_batched_gemm( + a_m_k, b_n_k, c_m_n_host_ref); + + pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref); + EXPECT_TRUE(pass); + } +}; From 28e02cf5243107a8b2ea65e0a8ef0e1c4bba3964 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 29 Nov 2024 07:18:43 -0800 Subject: [PATCH 28/52] Bump rocm-docs-core from 1.9.1 to 1.9.2 in /docs/sphinx (#1702) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.9.1 to 1.9.2. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.9.1...v1.9.2) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 79c74cd7f0..995dfaf027 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.9.1 +rocm-docs-core==1.9.2 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 426073037f..d8f7c38469 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.9.1 +rocm-docs-core==1.9.2 # via -r requirements.in six==1.16.0 # via pybtex From cff7fab798a867c9507fafe7beccd76afd0d16d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Sat, 30 Nov 2024 05:51:09 +0100 Subject: [PATCH 29/52] [CK TILE] Fix universal 
gemm template keywords (#1704) --- .../ops/gemm/block/block_universal_gemm_as_bs_cr.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp index c9e648f437..0fe0a9f40d 100644 --- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp +++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp @@ -623,7 +623,7 @@ struct BlockUniversalGemmAsBsCr CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window, const BSmemBlockWindow& b_block_window) { - block_gemm_impl_.template LocalPrefetch(a_block_window, b_block_window); + block_gemm_impl_.LocalPrefetch(a_block_window, b_block_window); } // C += A * B @@ -632,7 +632,7 @@ struct BlockUniversalGemmAsBsCr const ASmemBlockWindow& a_block_window, const BSmemBlockWindow& b_block_window) { - block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window); + block_gemm_impl_(c_block_tensor, a_block_window, b_block_window); } // C = A * B @@ -641,7 +641,7 @@ struct BlockUniversalGemmAsBsCr const BSmemBlockWindow& b_block_window) { auto c_block_tensor = MakeCBlockTile(); - block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window); + block_gemm_impl_(c_block_tensor, a_block_window, b_block_window); return c_block_tensor; } From 44828b7c0f0d2d4cba5b40c8f2706f542a436aa9 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Sat, 30 Nov 2024 08:11:42 -0800 Subject: [PATCH 30/52] [Python] Add batched gemm instances parsing (#1684) * add op * do not insert ds parameters as they are already parsed * reset ds parameters * apply ruff --- .../batched_universal_gemm/gen_instances.py | 149 ++++++++++++++++++ .../ck4inductor/batched_universal_gemm/op.py | 99 ++++++++++++ .../grouped_conv_fwd/gen_instances.py | 4 +- 3 files changed, 249 insertions(+), 3 deletions(-) 
create mode 100644 python/ck4inductor/batched_universal_gemm/gen_instances.py create mode 100644 python/ck4inductor/batched_universal_gemm/op.py diff --git a/python/ck4inductor/batched_universal_gemm/gen_instances.py b/python/ck4inductor/batched_universal_gemm/gen_instances.py new file mode 100644 index 0000000000..8879fb93db --- /dev/null +++ b/python/ck4inductor/batched_universal_gemm/gen_instances.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +import logging +import os +import subprocess +from dataclasses import replace +from functools import lru_cache +from typing import List + +from ..util import library_path + +from .op import CKBatchedGemmOperation + +log = logging.getLogger(__name__) + + +def _ck_library_dir(): + gemm_instances_path = os.path.join( + library_path(), + "src", + "tensor_operation_instance", + "gpu", + "gemm_universal_batched", + ) + if not os.path.exists(gemm_instances_path): + log.error("CK library path %s does not exist", gemm_instances_path) + return None + return gemm_instances_path + + +def parse_instances(str_instances: List[str]) -> List[CKBatchedGemmOperation]: + """ + Parse the lines containing Universal Gemm template instances into `CKBatchedGemmOperation` instances + """ + + def maybe_int(s): + try: + return int(s) + except ValueError: + return s + + op_instances = [] + for line in str_instances: + s_template_args = line.split("DeviceBatchedGemmMultiD_Xdl_CShuffle_V3")[ + -1 + ].strip("<>, ") + template_args = [] + i_current = 0 + while i_current < len(s_template_args): + if s_template_args[i_current] == " ": + # skip whitespace + i_current += 1 + continue + elif s_template_args[i_current : i_current + 2] == "S<": + # parse template S + i_next = s_template_args.find(">", i_current) + template_args.append( + tuple(map(int, s_template_args[i_current + 2 : i_next].split(","))) + ) + i_current = i_next + 2 + else: + # all string attributes must be 
either type aliases or global constants in C++ + i_next = s_template_args.find(",", i_current) + template_args.append( + maybe_int( + s_template_args[i_current : i_next if i_next != -1 else None] + ) + ) + if i_next != -1: + i_current = i_next + 1 + if i_next == -1: + break + + # ds layout and dtype are parsed as placeholder; reset value + template_args[2] = tuple() # ds layout + template_args[6] = tuple() # ds dtype + + new_instance = CKBatchedGemmOperation( + *template_args, # type: ignore[arg-type] + ) + + op_instances.append(new_instance) + return op_instances + + +@lru_cache(None) +def gen_ops_library() -> List[CKBatchedGemmOperation]: + """ + Parse the Universal Gemm instances defined in the composable kernel library folder. + """ + ck_library_dir = _ck_library_dir() + if not ck_library_dir: + return [] + + grep_result = subprocess.run( + [ + "grep", + "-inR", + "DeviceBatchedGemmMultiD_Xdl_CShuffle_V3", + _ck_library_dir(), + ], + capture_output=True, + text=True, + ) + + op_instances = parse_instances(grep_result.stdout.strip().split("\n")) + + log.debug("ck instances from library: %d", len(op_instances)) + + schedulers = [ + "BlockGemmPipelineScheduler::Intrawave", + "BlockGemmPipelineScheduler::Interwave", + ] + gemm_specs = [ + "GemmSpecialization::Default", + "GemmSpecialization::MPadding", + "GemmSpecialization::NPadding", + "GemmSpecialization::KPadding", + "GemmSpecialization::MNPadding", + "GemmSpecialization::MKPadding", + "GemmSpecialization::NKPadding", + "GemmSpecialization::MNKPadding", + ] + + # substitute templated args by looping through their domains + substitute_instances = [] + for instance in op_instances: + sub_scheduler = instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched" + sub_spec = instance.gemm_specialization == "GemmSpec" + schedulers_range = ( + schedulers if sub_scheduler else [instance.block_gemm_pipeline_scheduler] + ) + spec_range = gemm_specs if sub_spec else [instance.gemm_specialization] + for scheduler in 
schedulers_range: + for spec in spec_range: + substitute_instances.append( + replace( + instance, + block_gemm_pipeline_scheduler=scheduler, + gemm_specialization=spec, + ) + ) + + return substitute_instances + + +if __name__ == "__main__": + print(gen_ops_library()) diff --git a/python/ck4inductor/batched_universal_gemm/op.py b/python/ck4inductor/batched_universal_gemm/op.py new file mode 100644 index 0000000000..96978ac8d2 --- /dev/null +++ b/python/ck4inductor/batched_universal_gemm/op.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +from dataclasses import asdict, dataclass +from typing import Optional, Tuple + + +@dataclass +class CKBatchedGemmOperation: + """ + A python dataclass storing the template parameters of a CK Universal Gemm template instance + """ + + a_layout: str + b_layout: str + ds_layouts: Tuple[str] # addmm specific + c_layout: str + + a_element_dtype: str + b_element_dtype: str + ds_element_dtypes: Tuple[str] # addmm specific + c_element_dtype: str + + acc_dtype: str + c_shuffle_dtype: str + + a_elementwise_op: str + b_elementwise_op: str + c_elementwise_op: str + + gemm_specialization: str + + block_size: int + + m_per_block: int + n_per_block: int + k_per_block: int + + a_k1: int + b_k1: int + + m_per_xdl: int + n_per_xdl: int + + m_xdl_per_wave: int + n_xdl_per_wave: int + + a_block_transfer_thread_cluster_lengths_ak0_m_ak1: Tuple[int, int, int] + a_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int] + a_block_transfer_src_access_order: Tuple[int, int, int] + a_block_transfer_src_vector_dim: int + a_block_transfer_src_scalar_per_vector: int + a_block_transfer_dst_scalar_per_vector_ak1: int + a_block_lds_extra_m: bool + + b_block_transfer_thread_cluster_lengths_bk0_n_bk1: Tuple[int, int, int] + b_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int] + b_block_transfer_src_access_order: Tuple[int, int, int] + + 
b_block_transfer_src_vector_dim: int + b_block_transfer_src_scalar_per_vector: int + b_block_transfer_dst_scalar_per_vector_bk1: int + b_block_lds_extra_n: bool + + c_shuffle_m_xdl_per_wave_per_shuffle: int + c_shuffle_n_xdl_per_wave_per_shuffle: int + + c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block: ( + Tuple[int, int, int, int] + ) + c_shuffle_block_transfer_scalar_per_vector_n_per_block: Tuple[int] + block_gemm_pipeline_scheduler: str + block_gemm_pipeline_version: str + + a_compute_dtype: Optional[str] = None + b_compute_dtype: Optional[str] = None + + def name(self): + # cpp alias for template instance + return f"ck_device_batched_gemm_multi_d_xdl_c_shuffle_v3_{self.key_name()}" + + def key_name(self): + # TBD; must be unique per instance. Intended to use as dict key + return "_".join( + [ + "K" + + field_name.replace("_", "").lower() + + "V" + + ( + "x".join(map(str, iter(field_value))) + if isinstance(field_value, tuple) + else str(field_value).replace(":", "") + ) + for field_name, field_value in self.dict_items() + ] + ) + + def dict_items(self): + return asdict(self).items() diff --git a/python/ck4inductor/grouped_conv_fwd/gen_instances.py b/python/ck4inductor/grouped_conv_fwd/gen_instances.py index ffbea6bdc7..feca20a3b8 100644 --- a/python/ck4inductor/grouped_conv_fwd/gen_instances.py +++ b/python/ck4inductor/grouped_conv_fwd/gen_instances.py @@ -130,9 +130,7 @@ def gen_conv_ops_library() -> List[CKGroupedConvFwdOp]: # substitute templated args by looping through their domains substitute_instances = [] for instance in op_instances: - sub_scheduler = ( - instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched" - ) + sub_scheduler = instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched" sub_spec = instance.conv_forward_specialization == "ConvSpec" schedulers_range = ( schedulers if sub_scheduler else [instance.block_gemm_pipeline_scheduler] From 9488f1c981cda8515b45952a14e539621150c1f6 Mon Sep 17 00:00:00 2001 
From: rtmadduri Date: Mon, 2 Dec 2024 00:13:56 -0800 Subject: [PATCH 31/52] LWPCK-2429: Device grouped GEMM uses Async Memcpy (#1695) * LWPCK-2429: Device grouped GEMM uses Async Memcpy Resolving merge conflicts * reverting changes to profile_grouped_gemm * revert date change --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- .../impl/device_grouped_gemm_multiple_d_dl.hpp | 12 ++++++------ ...gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp | 10 +++++----- ...rouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp | 8 ++++---- .../gpu/device/impl/device_grouped_gemm_xdl.hpp | 12 ++++++------ .../impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp | 10 +++++----- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp index 060a16d1e2..959fc890b8 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp @@ -1,6 +1,6 @@ #pragma once // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -603,11 +603,11 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm Date: Mon, 2 Dec 2024 07:18:35 -0800 Subject: [PATCH 32/52] Bump rocm-docs-core from 1.9.2 to 1.10.0 in /docs/sphinx (#1706) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.9.2 to 1.10.0. 
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.9.2...v1.10.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 995dfaf027..9969824d25 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.9.2 +rocm-docs-core==1.10.0 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index d8f7c38469..bb731db2dd 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.9.2 +rocm-docs-core==1.10.0 # via -r requirements.in six==1.16.0 # via pybtex From 08d5c02c37253bf2a6852ad25f2db209f81c0fe7 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 3 Dec 2024 08:42:55 -0800 Subject: [PATCH 33/52] OCP FP8 support for gfx12. (#1710) * (2/5) bilinear gemm pass, perf bug: skip a lds has lower performance than skip b lds * (3/5) batched gemm pass, perf bug: skip a lds has lower performance than skip b lds * (4/5) grouped conv pass * (5/5) attention pass, todo: debug lds perf bug * AIT Attention API refactor (#8) * sanity pass * sanity pass 2 * confirm significant performance regression. 
* turn on all instances * turn off instance format * Fix bug & tuning & format * DML meta, self_attn+cross_attn * sanity pass * remove useless flag * update tile and problem size used in AIT attention * bug fix in grouped conv supporting check * deprecate inline asm wmma * Bug fix: double lds skip * clang-format * Fix errors in 1. example, fmha 2. gridwise pipeline 3. deviceop, fmha, change some containers from vector to array * part2 of previous commit * clang format * API fix of gridwisegemmpipeline * separate array base and vector base attention tensor transformation * fix gemm * clang format * add gemm fp16 instances * Temp save * fpAintB kernel compile pass * Sanity pass. * Temp save * debug code enabled * Fp16AInt8B_GEMM sanity * MQA implementation * GQA-4 example * tempsave * Compile pass * New implementation of fp16Aint8B Gemm, Achieve similar math throughput with native fp16 Gemm * Bump rocm-docs-core from 0.24.0 to 0.29.0 in /docs/sphinx Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.24.0 to 0.29.0. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.24.0...v0.29.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] * initial enablement of gfx950 * fix clang format * disable examples 31 and 41 int8 on gfx950 * initial navi4x enablement * remove extra endif * enabled dl_gemm * update s_barrier and s_waitcnt for gfx12 * fix the gfx12 assembly syntax * fixed block_sync_lds * add support for more dl kernels on navi4 * add wmma * format * Todo: fix gemm_bilinear_wmma instances compilation bug * Solve a bug when K1=16 * remove unnecessary changes * Remove tensor layout limitation to LDS usage in tensor contraction * fixed block_sync_lds * merge navi3_ref * update self-attention and cross-attention * fix a typo of name * fixed layout * debugging * Add arch limiter for fp8 gemm * fixed wmma * enable fp8 gemm_xdl for all gfx9 targets * temporarily disable gemm_xdl_fp16_fp8 on MI100/200 * fix the cmake logic for gemm_xdl_fp16_fp8 * fixed c_output * re-enable the gemm_xdl_fp16_fp8 on MI100/200 * fixed gfx12 * fixed * fixed * separate gfx12 blockwise_gemm * fixed * enable fwd conv on navi4x * enable gridwise * enabled gemm * fixed merge * remove empty example fold * fixed conflicts * some small changes * Update cmake-ck-dev.sh * Update cmake-ck-dev.sh * enabled other types * fixed register loads * test fa * enable gfx12 * clean up * enable some instances on gfx12 * add gfx1201 macro in amd_wmma header * fix clang format * enable batched_gemm_softmax_gemm_perm_wmma for gfx12 * disable instances with blocksize=256 in attention examples * debugging * debug * fixed lds_enabled * debugging * Fix and add limit to skiplds feature * Enable skipLds feature and fix compilation bugs * add ck_tile definitions for gfx12 * fix clang format and test/wmma_op * update instances cmake for gfx12 * disable the test_wmma_op on gfx12 * fix the builds for gfx950 * add gfx12 and gfx950 to default target list * clean-up cmake file * Initial introduction of OFP8 data types. * Renamed FP8 and BF8 tests into FP8_FNUZ and BF8_FNUZ. 
* Implementation of ConvertFP32Nearest in test_fp8_ocp. * Remove dependence on possibly undeclared alias. * Implement FP8OCP test for stochastic rounding mode. * Implement FP8OCP tests for half_t type conversions. * enable bf16 atomic add on gfx950 * Implement ConvertFP32Nearest test. * Implement ConvertFP32Stochastic test. * Implement ConvertFP16Nearest and ConvertFP16Stochastic tests. * Refactoring. Move FP8 definitions into a separate header file. * Enable easy switching between architectures. * Fix compilation error for gfx942 architecture. * only build gfx950 branch for gfx950 target by default * Enable OCP build of example_gemm_xdl_fp8. * Fix formatting. * fix the build logic for gfx950 * Improve GEMM example verbosity. * Add constexpr where applicable. * fix the logic of enabling XDL and WMMA instances * Improve GEMM example verbosity. * Enable build of example_gemm_xdl_fp8_bf8 test. * Fix tests for gfx1101 architecture. * Build DPP examples only on gfx103 and gfx11 architectures. * Optionally run either CPU or GPU verifications with GEMM examples. * Extend GeneratorTensor_Sequential to produce values of prescribed data types. * Add missing constructor. * Improve infrastructure for OFP8 data type support. * BUGFIX. Should not use FP8 as Compute/Accum data type. * Add custom target for grouped_convnd_bwd_weight tests. * Can build `tests` target on gfx950. * Bugfixes on gfx1101 architecture. * Fix dependencies. * Provide single point of truth for FP8 INF and NAN checks * Prevent instantiation of operators that are not supported by FP8 data types * Add FP8 type selection into client_example CMakeLists.txt * Prevent sccache server from shutting down during build * Fix test success reporting logic * Change default verification method to CPU. GPU verification takes too much time to complete on the emulator. 
* Make sure all tests and examples are built for gfx950 * Facilitate testing of FP8 data types on the emulator * Introduce two new tensor generators * Enable instances built for gfx94 to be built on gfx950 * Verify 35_splitk_gemm on floating point numbers. splitk gemm appears to be losing precision VS reference implementation when FP numbers are involved. * Verify 04_gemm_add_add_fastgelu on floating point numbers * Verify 20_grouped_conv_bwd_weight on floating point numbers * Verify 38_grouped_conv_bwd_data_multiple_d on floating point numbers * Verify more tests on floating point data * Fix data types and improve testing verbosity. * Upgrade to NPI 573 build docker. * Skip on gemm_universal tests. The tests take too long to complete on the emulator. Need to see if it is possible to reduce the scope of the testing to just FP8 data types. * Fix gfx1101 build * Document test availability * Re-enable fp8 gemms for gfx94/95 * Cherry-pick GEMM Universal tests for FP8 data types * Cleanup * CK_USE_GFX94 has already been set on this branch * Address formatting issues and leftovers * Make fail/pass logic consistent within 01_gemm folder Removed multiple negations in fail/pass logic to propagate `true` as the success indicator. * Fix GPU verification reporting logic. * Update year in copyright notice. * Cleanup * Use `enum class` instead of `enum` * Remove set_property for FP8 tests * Narrowing the scope of PR to OCP FP8 enablement only * Add tests for OCP FP8 vector_type storage * Enable gemm kernel on all gfx9 architectures (#227) * clean-up * Implement `non_native_vector_base` with `ext_vector_type` array. (#232) * Enable support of 1, 2, 4, and 8-byte custom types in CK. 
* Fix pool tests for OCP FP8 data type * fix jenkins file * restore cron trigger --------- Signed-off-by: dependabot[bot] Co-authored-by: aska-0096 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jing Zhang Co-authored-by: zjing14 Co-authored-by: Jun Liu Co-authored-by: Andriy Roshchenko Co-authored-by: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> --- CMakeLists.txt | 11 +- client_example/CMakeLists.txt | 8 + example/01_gemm/common.hpp | 2 +- example/01_gemm/run_gemm_example.inc | 4 +- ...rouped_gemm_multiple_d_splitk_xdl_fp16.cpp | 8 +- .../grouped_gemm_multiple_d_xdl_fp16.cpp | 8 +- .../grouped_gemm_xdl_fixed_nk_bias_fp16.cpp | 6 +- .../grouped_gemm_xdl_fixed_nk_fp16.cpp | 4 +- .../grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp | 4 +- .../run_grouped_gemm_example.inc | 7 +- ...xdl_layernorm_naive_single_kernel_fp16.cpp | 6 +- .../run_batched_gemm_gemm_example.inc | 4 +- .../run_batched_gemm_scale_softmax_gemm.inc | 4 +- ...atched_gemm_scale_softmax_gemm_permute.inc | 4 +- ...d_gemm_scale_softmax_gemm_permute_wmma.inc | 4 +- .../run_cross_attention_wmma.inc | 4 +- ...rouped_gemm_scale_softmax_gemm_permute.inc | 4 +- ...n_grouped_query_attention_forward_wmma.inc | 4 +- ...run_multi_query_attention_forward_wmma.inc | 4 +- .../run_self_attention_wmma.inc | 4 +- .../run_splitK_gemm_example.inc | 7 +- ...ed_gemm_add_add_relu_gemm_add_xdl_fp16.cpp | 2 +- .../common.hpp | 4 +- .../gemm_bias_softmax_gemm_permute_xdl.cpp | 4 +- ...mm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp | 8 +- ..._gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp | 6 +- ...emm_multiply_multiply_xdl_fp8_ab_scale.cpp | 3 - example/CMakeLists.txt | 7 + include/ck/library/utility/host_tensor.hpp | 2 +- .../library/utility/host_tensor_generator.hpp | 31 +- ...conv_bwd_weight_two_stage_xdl_cshuffle.hpp | 3 +- ..._gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp | 3 +- include/ck/utility/amd_buffer_addressing.hpp | 14 +- include/ck/utility/amd_ck_fp8.hpp | 988 
++++++++++++++++++ include/ck/utility/amd_xdlops.hpp | 2 +- include/ck/utility/data_type.hpp | 445 ++++++-- include/ck/utility/math_v2.hpp | 4 +- include/ck/utility/random_gen.hpp | 13 +- include/ck/utility/type_convert.hpp | 204 ++-- .../cpu/reference_gemm.hpp | 10 +- .../gpu/CMakeLists.txt | 4 +- ...evice_max_pool3d_fwd_ndhwc_f8_instance.cpp | 4 +- ...ed_gemm_bias_softmax_gemm_permute_impl.hpp | 4 +- .../profile_batched_gemm_gemm_impl.hpp | 4 +- ...profile_batched_gemm_softmax_gemm_impl.hpp | 4 +- ...batched_gemm_softmax_gemm_permute_impl.hpp | 4 +- .../include/profiler/profile_gemm_impl.hpp | 6 +- test/data_type/CMakeLists.txt | 37 +- .../{test_bf8.cpp => test_bf8_fnuz.cpp} | 135 +-- test/data_type/test_bf8_ocp.cpp | 268 +++++ test/data_type/test_custom_type.cpp | 158 +++ .../{test_fp8.cpp => test_fp8_fnuz.cpp} | 149 +-- test/data_type/test_fp8_ocp.cpp | 250 +++++ test/pool/test_avg_pool2d_fwd.cpp | 2 +- test/pool/test_max_pool2d_fwd.cpp | 2 +- 55 files changed, 2510 insertions(+), 385 deletions(-) create mode 100644 include/ck/utility/amd_ck_fp8.hpp rename test/data_type/{test_bf8.cpp => test_bf8_fnuz.cpp} (52%) create mode 100644 test/data_type/test_bf8_ocp.cpp rename test/data_type/{test_fp8.cpp => test_fp8_fnuz.cpp} (52%) create mode 100644 test/data_type/test_fp8_ocp.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b28a6d9127..2c86987561 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,13 +185,22 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx9") add_definitions(-DCK_USE_XDL) endif() if (SUPPORTED_GPU_TARGETS MATCHES "gfx94") - message("Enabling FP8 gemms in ckProfiler") + message("Enabling FP8 gemms on native architectures") add_definitions(-DCK_USE_GFX94) endif() if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12") message("Enabling WMMA instances") add_definitions(-DCK_USE_WMMA) endif() +if (SUPPORTED_GPU_TARGETS MATCHES "gfx12") + add_definitions(-DCK_USE_OCP_FP8) + set(CK_USE_OCP_FP8 "ON") +endif() +if 
(SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx94") + add_definitions(-DCK_USE_FNUZ_FP8) + set(CK_USE_FNUZ_FP8 "ON") +endif() + option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF) if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908")) add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH) diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt index c393972b42..ce5834d1e2 100644 --- a/client_example/CMakeLists.txt +++ b/client_example/CMakeLists.txt @@ -56,6 +56,14 @@ if (GPU_TARGETS) add_definitions(-DCK_USE_WMMA) set(CK_USE_WMMA "ON") endif() + if (GPU_TARGETS MATCHES "gfx12") + add_definitions(-DCK_USE_OCP_FP8) + set(CK_USE_OCP_FP8 "ON") + endif() + if (GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx94") + add_definitions(-DCK_USE_FNUZ_FP8) + set(CK_USE_FNUZ_FP8 "ON") + endif() else() add_definitions(-DCK_USE_WMMA -DCK_USE_XDL) set(CK_USE_XDL "ON") diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp index 67bf92bbbc..a3a62d4cfa 100644 --- a/example/01_gemm/common.hpp +++ b/example/01_gemm/common.hpp @@ -76,7 +76,7 @@ struct ProblemSizeSplitK final struct ExecutionConfig final { // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU - int do_verification = 3; + int do_verification = 1; int init_method = 2; bool time_kernel = false; }; diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index bafec3f358..3ee6e26856 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -143,8 +143,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) switch(config.init_method) { case 0: - ck::utils::FillConstant{static_cast(1.f)}(a_m_k); - ck::utils::FillConstant{static_cast(1.f)}(b_k_n); + ck::utils::FillConstant{ck::type_convert(1.f)}(a_m_k); + 
ck::utils::FillConstant{ck::type_convert(1.f)}(b_k_n); break; case 1: ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp index 8bbf8e629e..117a18e3bd 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp @@ -186,15 +186,15 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); for(int j = 0; j < NumDMatrices; ++j) { - d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); } break; default: - a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); for(int j = 0; j < NumDMatrices; ++j) { - d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); + d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential{}); } } } diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp index e7b2ee4173..db162fe444 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp @@ -190,15 +190,15 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); for(int j = 0; j < NumDs; ++j) { - d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); } break; default: - 
a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); for(int j = 0; j < NumDs; ++j) { - d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); + d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential{}); } } } diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp index 3b3ef508ce..5bdc993192 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp @@ -167,11 +167,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } - d0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + d0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } using GroupedGemmKernelArgument = ck::tensor_operation::device::GroupedGemmKernelArgument<1>; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp index c1043f419d..6806bd1886 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp @@ -157,8 +157,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - 
a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } } diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp index c81874b066..8418c10f5e 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp @@ -158,8 +158,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } } diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc index 7cb0588b82..64125cd1d0 100644 --- a/example/15_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ #pragma once struct ProblemSize final @@ -124,8 +127,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } } diff --git a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp index 90d80f9f03..277fea0272 100644 --- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -175,8 +175,8 @@ int main(int argc, char* argv[]) b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_m_k.GenerateTensorValue(GeneratorTensor_Sequential{}); + b_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); } c0_n_bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); diff --git a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc index f329146728..d545508680 100644 --- a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc +++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -150,7 +150,7 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[]) break; default: a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc index 27602e2313..1514fc48b3 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -157,7 +157,7 @@ int run(int argc, char* argv[]) break; default: a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc index fa76faea84..2b02069e65 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
int run(int argc, char* argv[]) { @@ -118,7 +118,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc index 2e77479bcc..e0ccb6dad1 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -153,7 +153,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc index 9ff4c56e06..0ad031cc71 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -178,7 +178,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc index ea1e2734a6..cdfd86dff4 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
int run(int argc, char* argv[]) { @@ -152,7 +152,7 @@ int run(int argc, char* argv[]) break; default: a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{1}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc index 609d085299..7ac29f33ca 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -156,7 +156,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc index b05915c07f..fb9b1b0bd7 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. 
All rights reserved. int run(int argc, char* argv[]) { @@ -156,7 +156,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc index 3fdaaebb0f..2cb69380e5 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { @@ -173,7 +173,7 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/example/35_splitK_gemm/run_splitK_gemm_example.inc b/example/35_splitK_gemm/run_splitK_gemm_example.inc index e3690984ab..cb1d3410c9 100644 --- a/example/35_splitK_gemm/run_splitK_gemm_example.inc +++ b/example/35_splitK_gemm/run_splitK_gemm_example.inc @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ #pragma once struct ProblemSize final @@ -66,8 +69,8 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a_m_k.GenerateTensorValue(GeneratorTensor_Sequential{}); + b_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); } DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp index ff1282f3c7..f27dc60541 100644 --- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp +++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp @@ -377,7 +377,7 @@ int main(int argc, char* argv[]) break; default: a0_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); d00_g_m_n.GenerateTensorValue(GeneratorTensor_1{1}); d01_g_m_n.GenerateTensorValue(GeneratorTensor_1{1}); b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); diff --git a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp index 8a0474156c..6af8ac6488 100644 --- a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp +++ b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -41,7 +41,7 @@ struct ExecutionConfig final { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; }; #define DefaultConvParams \ diff --git a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp index a90a6340a4..392cb155cb 100644 --- a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp +++ b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -248,7 +248,7 @@ int main(int argc, char* argv[]) d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1{1}); break; default: - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1{1}); diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp index 742fd5547a..055d253042 100644 --- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp +++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -194,9 +194,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b1_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); - b1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } d0_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp index 809c1a956c..1ba8133ea7 100644 --- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp +++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp @@ -184,9 +184,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - a1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); - b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + a1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential{}); } d0_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp index 2568754648..9b7849a654 100644 --- 
a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp @@ -205,7 +205,6 @@ int main(int argc, char* argv[]) a1_device_buf.ToDevice(a1_m_k.mData.data()); b0_device_buf.ToDevice(b0_k_n.mData.data()); b1_device_buf.ToDevice(b1_k_n.mData.data()); - e_device_buf.ToDevice(e_m_n_device_result.mData.data()); auto a_element_op = AElementOp{}; auto b_element_op = BElementOp{}; @@ -253,8 +252,6 @@ int main(int argc, char* argv[]) std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl; - e_device_buf.FromDevice(e_m_n_device_result.mData.data()); - if(do_verification) { Tensor c_m_n({M, N}); diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index ea739c7071..72759916af 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -54,6 +54,13 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME) list(REMOVE_ITEM FILE_NAME "${source}") endif() endforeach() + #Do not build any DPP examples if DL_KERNELS not set + foreach(source IN LISTS FILE_NAME) + if(NOT DEFINED DL_KERNELS AND source MATCHES "_dpp") + message("removing dpp example ${source} ") + list(REMOVE_ITEM FILE_NAME "${source}") + endif() + endforeach() #Do not build any XDL examples if gfx9 targets are not on the list foreach(source IN LISTS FILE_NAME) if(NOT EX_TARGETS MATCHES "gfx9" AND source MATCHES "_xdl") diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp index a58acaf116..18e1db462a 100644 --- a/include/ck/library/utility/host_tensor.hpp +++ b/include/ck/library/utility/host_tensor.hpp @@ -326,7 +326,7 @@ struct Tensor std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); } - void SetZero() { ck::ranges::fill(mData, 0); } + void SetZero() { ck::ranges::fill(mData, T{0}); } template void ForEach_impl(F&& f, std::vector& idx, size_t rank) diff --git 
a/include/ck/library/utility/host_tensor_generator.hpp b/include/ck/library/utility/host_tensor_generator.hpp index e87811b76b..ab9f01b53c 100644 --- a/include/ck/library/utility/host_tensor_generator.hpp +++ b/include/ck/library/utility/host_tensor_generator.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -37,7 +37,7 @@ struct GeneratorTensor_1 float value = 1.0; template - ck::bhalf_t operator()(Is...) + ck::half_t operator()(Is...) { return ck::type_convert(value); } @@ -62,7 +62,7 @@ struct GeneratorTensor_1 float value = 1.0; template - ck::bhalf_t operator()(Is...) + ck::f8_t operator()(Is...) { return ck::type_convert(value); } @@ -256,14 +256,33 @@ struct GeneratorTensor_Checkboard } }; -template +/** + * @brief Is used to generate sequential values based on the specified dimension. + * + * @tparam T The type of the tensor values. + * @tparam Dim The specific dimension used for generation. + * + * GeneratorTensor_Sequential<1>{} will generate the following values for a 3x3 tensor: + * + * 0 1 2 + * 0 1 2 + * 0 1 2 + * + * Essentially, the values generated are logical coordinates of the generated element that + * correspond to dimension Dim. E.g. for 2-dimensional tensor and Dim=1, the values are the column + * indices. + * + */ +template struct GeneratorTensor_Sequential { template - float operator()(Ts... Xs) const + T operator()(Ts... 
Xs) const { std::array dims = {{static_cast(Xs)...}}; - return dims[Dim]; + + float tmp = dims[Dim]; + return ck::type_convert(tmp); } }; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp index c1f58ccda5..a7f129b2b2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp @@ -111,8 +111,7 @@ __global__ void [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, [[maybe_unused]] const index_t num_k_per_block) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ - defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) // offset base pointer for each work-group const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge); const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp index da6b1b304e..813acfa656 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp @@ -38,8 +38,7 @@ __global__ void // __attribute__((amdgpu_waves_per_eu(1, 1))) kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ - defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) __shared__ char 
p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; GridwiseGemm::template Run( diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp index d4ee5c886c..5367c3d720 100644 --- a/include/ck/utility/amd_buffer_addressing.hpp +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -549,8 +549,10 @@ __device__ void amd_buffer_store_impl(const typename vector_type::type src (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (is_same::value && + (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), "wrong! not implemented"); @@ -843,8 +845,8 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave, #else - vector_t tmp = amd_buffer_load_impl( - src_wave_buffer_resource, src_thread_addr_offset, 0); + vector_t tmp{amd_buffer_load_impl( + src_wave_buffer_resource, src_thread_addr_offset, 0)}; return src_thread_element_valid ? tmp : vector_t(0); #endif } @@ -873,8 +875,8 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave, constexpr index_t vector_size = scalar_type::vector_size; - vector_t tmp = amd_buffer_load_impl( - src_wave_buffer_resource, src_thread_addr_offset, 0); + vector_t tmp{amd_buffer_load_impl( + src_wave_buffer_resource, src_thread_addr_offset, 0)}; return src_thread_element_valid ? 
tmp : vector_t(customized_value); } diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp new file mode 100644 index 0000000000..7b21ad6464 --- /dev/null +++ b/include/ck/utility/amd_ck_fp8.hpp @@ -0,0 +1,988 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/random_gen.hpp" +#include "ck/utility/type.hpp" + +#ifdef CK_USE_FNUZ_FP8 +#define CK_USE_FNUZ_FP8 1 +#else +#define CK_USE_FNUZ_FP8 0 +#endif + +#ifdef CK_USE_OCP_FP8 +#define CK_USE_OCP_FP8 1 +#else +#define CK_USE_OCP_FP8 0 +#endif + +namespace ck { + +using f8_fnuz_t = _BitInt(8); +using bf8_fnuz_t = unsigned _BitInt(8); + +#if(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx1200__) || \ + defined(__gfx1201__)) && \ + __HIP_DEVICE_COMPILE__ +#define CK_FP8_CVT_FAST_PATH 1 +#else +#define CK_FP8_CVT_FAST_PATH 0 +#endif + +#if(defined(__gfx1200__) || defined(__gfx1201__)) && __HIP_DEVICE_COMPILE__ +#define CK_OCP_FP8_CVT_FAST_PATH 1 +#else +#define CK_OCP_FP8_CVT_FAST_PATH 0 +#endif + +typedef unsigned char fp8_storage_t; + +/** + * \brief Describes FP8 interpretation + */ +enum class ck_fp8_interpretation_t +{ + CK_E4M3_OCP = 0, // OCP E4M3 + CK_E5M2_OCP = 1, // OCP E5M2 + CK_E4M3_FNUZ = 2, // FP8 + CK_E5M2_FNUZ = 3, // BF8 +}; + +/** + * \brief Describes saturation behavior + */ +enum class ck_saturation_t +{ + CK_NOSAT = 0, // No saturation - replace with NaN or Inf + CK_SATFINITE = 1, // Saturate to finite +}; + +namespace fp8_impl { + +typedef fp8_storage_t fp8x2_storage_t __attribute__((ext_vector_type(2))); +typedef float float2_t __attribute__((ext_vector_type(2))); + +__host__ __device__ static inline constexpr bool fnuz_f8_is_nan(f8_fnuz_t a) +{ + return static_cast(a) == 0x80; +} +__host__ __device__ static inline constexpr bool fnuz_bf8_is_nan(bf8_fnuz_t a) +{ + return static_cast(a) == 0x80; +} + +__host__ __device__ static inline 
constexpr bool ocp_f8_is_nan(fp8_storage_t a) +{ + return (a & 0x7f) == 0x7f; +} +__host__ __device__ static inline constexpr bool ocp_bf8_is_nan(fp8_storage_t a) +{ + return (a & 0x7f) > 0x7c; +} + +// The conversion function is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L220 +// This has been modified to handle double types as well +template +__host__ __device__ static inline T cast_from_f8(fp8_storage_t x) +{ + constexpr bool is_half = __hip_internal::is_same::value; + constexpr bool is_float = __hip_internal::is_same::value; + constexpr bool is_double = __hip_internal::is_same::value; + static_assert(is_half || is_float || is_double, "only half, float and double are supported"); + + constexpr int weo = is_half ? 5 : (is_float ? 8 : 11); + constexpr int wmo = is_half ? 10 : (is_float ? 23 : 52); + + T fInf, fNegInf, fNaN, fNeg0, fmax, fmin; + if constexpr(is_half) + { + const unsigned short int ihInf = 0x7C00; + const unsigned short int ihNegInf = 0xFC00; + const unsigned short int ihNaN = 0x7C01; + const unsigned short int ihNeg0 = 0x8000; + /* Max number in e5m2 57344*/ + const unsigned short int ifmax = 0x7B00; + const unsigned short int ifmin = 0xFB00; + + fInf = bit_cast<_Float16>(ihInf); + fNegInf = bit_cast<_Float16>(ihNegInf); + fNaN = bit_cast<_Float16>(ihNaN); + fNeg0 = bit_cast<_Float16>(ihNeg0); + fmax = bit_cast<_Float16>(ifmax); + fmin = bit_cast<_Float16>(ifmin); + } + else if constexpr(is_float) + { + const unsigned int ifInf = 0x7F800000; + const unsigned int ifNegInf = 0xFF800000; + const unsigned int ifNaN = 0x7F800001; + const unsigned int ifNeg0 = 0x80000000; + /* Max number in e5m2 57344*/ + const unsigned int ifmax = 0x47600000; + const unsigned int ifmin = 0xC7600000; + + fInf = bit_cast(ifInf); + fNegInf = bit_cast(ifNegInf); + fNaN = bit_cast(ifNaN); + fNeg0 = bit_cast(ifNeg0); + fmax = bit_cast(ifmax); + fmin = bit_cast(ifmin); + } + else 
if constexpr(is_double) + { + const unsigned long long ifInf = 0x7FF0000000000000ull; + const unsigned long long ifNegInf = 0xFFF0000000000000ull; + const unsigned long long ifNaN = 0x7FF0000000000001ull; + const unsigned long long ifNeg0 = 0x8000000000000000ull; + /* Max number in e5m2 57344*/ + const unsigned long long ifmax = 0x40EC000000000000ull; + const unsigned long long ifmin = 0xC0EC000000000000ull; + + fInf = bit_cast(ifInf); + fNegInf = bit_cast(ifNegInf); + fNaN = bit_cast(ifNaN); + fNeg0 = bit_cast(ifNeg0); + fmax = bit_cast(ifmax); + fmin = bit_cast(ifmin); + } + + if(x == 0) + { + return 0; + } + + unsigned long long sign = x >> 7; + unsigned long long mantissa = x & ((1 << wm) - 1); + int exponent = (x & 0x7F) >> wm; + if constexpr(is_fnuz) + { + if(x == 0x80) + { + return fNaN; + } + } + else + { + if(x == 0x80) + { + return fNeg0; + } + if constexpr(we == 4) + { // e4m3 + if((x & 0x7F) == 0x7F) + { + return fNaN; + } + } + else if((x & 0x7C) == 0x7C) + { // e5m2 + if((x & 0x3) == 0) + { + if constexpr(clip) + { + return sign ? fmin : fmax; + } + return sign ? fNegInf : fInf; + } + return fNaN; + } + } + + typename __hip_internal::conditional< + sizeof(T) == 2, + unsigned short int, + typename __hip_internal::conditional:: + type>::type retval; + + if constexpr(we == 5 && is_half && !is_fnuz) + { + retval = x << 8; + return bit_cast(retval); + } + + const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (is_fnuz ? 
1 : 0); + + // subnormal input + if(exponent == 0) + { +#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ + // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above + int sh = 1 + __clz(mantissa) - (32 - wm); +#else + int sh = 1 + __builtin_clz(mantissa) - (32 - wm); +#endif + mantissa <<= sh; + exponent += 1 - sh; + mantissa &= ((1ull << wm) - 1); + } + exponent += exp_low_cutoff - 1; + mantissa <<= wmo - wm; + + // subnormal output (occurs when T=half, we=5, is_fnuz=true) + if(exponent <= 0) + { + mantissa |= 1 << wmo; + mantissa >>= 1 - exponent; + exponent = 0; + } + + if constexpr(sizeof(T) == 2) + retval = (sign << 15) | (exponent << 10) | mantissa; + else if constexpr(sizeof(T) == 4) + retval = (sign << 31) | (exponent << 23) | mantissa; + else + retval = (sign << 63) | (static_cast(exponent) << 52) | mantissa; + + return bit_cast(retval); +} + +#if CK_FP8_CVT_FAST_PATH +template +static __device__ float cast_to_f32_from_f8(fp8_storage_t v) +{ + union + { + unsigned int i32val; + unsigned char i8val[4]; + } val; + val.i8val[0] = v; + + static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ || + interpret == ck_fp8_interpretation_t::CK_E4M3_OCP || + interpret == ck_fp8_interpretation_t::CK_E5M2_FNUZ || + interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, + "Only FNUZ and OCP interpretations are supported"); + + if constexpr((interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) || + (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)) + { + return __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0); + } + else + { + return __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0); + } +} + +template +static __device__ float2_t cast_to_f32x2_from_f8x2(fp8x2_storage_t v) +{ + const auto i16val = bit_cast(v); + + static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ || + interpret == ck_fp8_interpretation_t::CK_E4M3_OCP || + interpret == ck_fp8_interpretation_t::CK_E5M2_FNUZ || + interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, + 
"Only FNUZ and OCP interpretations are supported"); + + if constexpr((interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) || + (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)) + { + return __builtin_amdgcn_cvt_pk_f32_fp8(i16val, false); + } + else + { + return __builtin_amdgcn_cvt_pk_f32_bf8(i16val, false); + } +} + +#endif + +} // namespace fp8_impl + +struct f8_ocp_t +{ + using data_type = fp8_storage_t; + data_type data; + + static constexpr ck_saturation_t default_saturation = ck_saturation_t::CK_SATFINITE; + static constexpr ck_fp8_interpretation_t default_interpret = + ck_fp8_interpretation_t::CK_E4M3_OCP; + + static constexpr unsigned int we = 4; // exponent width + static constexpr unsigned int wm = 3; // mantissa width + + __host__ __device__ constexpr bool operator==(const f8_ocp_t& other) const + { + return (data == other.data) && (fp8_impl::ocp_f8_is_nan(data) == false); // NaN != NaN + } + +#if CK_USE_OCP_FP8 + __host__ __device__ explicit operator float() const +#else + __host__ explicit operator float() const +#endif + { +#if CK_OCP_FP8_CVT_FAST_PATH + return fp8_impl::cast_to_f32_from_f8(this->data); +#else + return fp8_impl::cast_from_f8( + this->data); // XXX: clip==false must be consistent with operator _Float16 +#endif + } + +#if CK_USE_OCP_FP8 + __host__ __device__ explicit operator _Float16() const +#else + __host__ explicit operator _Float16() const +#endif + { +#if CK_OCP_FP8_CVT_FAST_PATH + return static_cast<_Float16>(fp8_impl::cast_to_f32_from_f8(this->data)); +#else + return fp8_impl::cast_from_f8<_Float16, wm, we, false>( + this->data); // XXX: clip==false must be consistent with operator float +#endif + } +}; + +struct bf8_ocp_t +{ + using data_type = fp8_storage_t; + data_type data; + + static constexpr ck_saturation_t default_saturation = ck_saturation_t::CK_SATFINITE; + static constexpr ck_fp8_interpretation_t default_interpret = + ck_fp8_interpretation_t::CK_E5M2_OCP; + + static constexpr unsigned int we = 5; // exponent width 
+ static constexpr unsigned int wm = 2; // mantissa width + + __host__ __device__ constexpr bool operator==(const bf8_ocp_t& other) const + { + return (data == other.data) && (fp8_impl::ocp_bf8_is_nan(data) == false); // NaN != NaN + } + +#if CK_USE_OCP_FP8 + __host__ __device__ explicit operator float() const + +#else + __host__ explicit operator float() const +#endif + { +#if defined(__gfx1200__) || defined(__gfx1201__) + return fp8_impl::cast_to_f32_from_f8(this->data); +#else + return fp8_impl::cast_from_f8( + this->data); // XXX: clip==false must be consistent with operator _Float16 +#endif + } + +#if CK_USE_OCP_FP8 + __host__ __device__ explicit operator _Float16() const +#else + __host__ explicit operator _Float16() const +#endif + { +#if defined(__gfx1200__) || defined(__gfx1201__) + return static_cast<_Float16>(fp8_impl::cast_to_f32_from_f8(this->data)); +#else + return fp8_impl::cast_from_f8<_Float16, wm, we, false>( + this->data); // XXX: clip==false must be consistent with operator float +#endif + } +}; + +template +__host__ __device__ static inline constexpr bool fp8_is_nan(T); + +template <> +__host__ __device__ inline constexpr bool fp8_is_nan(f8_ocp_t a) +{ + return fp8_impl::ocp_f8_is_nan(a.data); +} +template <> +__host__ __device__ inline constexpr bool fp8_is_nan(bf8_ocp_t a) +{ + return fp8_impl::ocp_bf8_is_nan(a.data); +} +template <> +__host__ __device__ inline constexpr bool fp8_is_nan(f8_fnuz_t a) +{ + return fp8_impl::fnuz_f8_is_nan(a); +} +template <> +__host__ __device__ inline constexpr bool fp8_is_nan(bf8_fnuz_t a) +{ + return fp8_impl::fnuz_bf8_is_nan(a); +} + +template || std::is_same_v || + std::is_same_v || std::is_same_v, + bool> = true> +__host__ __device__ static inline constexpr bool fp8_is_inf(T) +{ + return false; +} +template <> +__host__ __device__ inline constexpr bool fp8_is_inf(bf8_ocp_t a) +{ + return (a.data & 0x7f) == 0x7c; +} + +namespace fp8_impl { + +// Assertions to check for supported conversion types +#define 
__assert_ocp_support(interp) \ + { \ + if(interp != ck_fp8_interpretation_t::CK_E4M3_OCP && \ + interp != ck_fp8_interpretation_t::CK_E5M2_OCP) \ + { \ + __hip_assert(false && "type is unsupported by current target device"); \ + } \ + } +#define __assert_fnuz_support(interp) \ + { \ + if(interp != ck_fp8_interpretation_t::CK_E4M3_FNUZ && \ + interp != ck_fp8_interpretation_t::CK_E5M2_FNUZ) \ + { \ + __hip_assert(false && "type is unsupported by current target device"); \ + } \ + } + +__host__ __device__ static inline void +__is_interpret_supported([[maybe_unused]] ck_fp8_interpretation_t interp) +{ +#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ +#if CK_USE_OCP_FP8 + __assert_ocp_support(interp); +#endif +#if CK_USE_FNUZ_FP8 + __assert_fnuz_support(interp); +#endif +#endif +} + +#if CK_FP8_CVT_FAST_PATH +// The conversion function is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_float8.h#L79 +template +static __device__ fp8_storage_t cast_to_f8_from_f32(float v, unsigned int rng = 0) +{ + fp8_storage_t i8data; + union + { + float fval; + unsigned int i32val; + unsigned char i8val[4]; // NOTE: not endian independent + } val; + + unsigned int ival = 0; + val.fval = v; + + if constexpr(saturate) + { + if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) + { + if((val.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0); + } + } + else if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP) + { // OCP type + if((val.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 448.0, -448.0); + } + } + else + { + if((val.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 57344.0, -57344.0); + } + } + } + + if constexpr(stochastic_rounding) + { 
+ ival = (interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) || + (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP) + ? __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0) + : __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos + val.i32val = ival; + i8data = val.i8val[0]; // little endian + } + else + { // RNE CVT + ival = (interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) || + (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP) + ? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false) + : __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, + val.fval, + ival, + false); // false -> WORD0 + val.i32val = ival; + i8data = val.i8val[0]; + } + return i8data; +} +#endif // CK_FP8_CVT_FAST_PATH + +// The conversion function is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L39 +// This has been modified to add double types conversion as well +template +__host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rng = 0) +{ + constexpr bool is_half = __hip_internal::is_same::value; + constexpr bool is_float = __hip_internal::is_same::value; + constexpr bool is_double = __hip_internal::is_same::value; + static_assert(is_half || is_float || is_double, + "Only half, float and double can be cast to f8"); + + constexpr int mfmt = (sizeof(T) == 8) ? 52 : ((sizeof(T) == 4) ? 
23 : 10); + + using T_bitwise = typename __hip_internal::conditional< + sizeof(T) == 2, + unsigned short int, + typename __hip_internal::conditional:: + type>::type; + T_bitwise x_bitwise = bit_cast(_x); + + unsigned long long x{x_bitwise}; + + unsigned long long head, mantissa; + int exponent, bias; + unsigned int sign; + unsigned long long fInf, mask; + + if constexpr(sizeof(T) == 8) + { + head = x & 0xFFF0000000000000ull; + mantissa = x & 0xFFFFFFFFFFFFFull; + exponent = (head >> 52) & 0x7FF; + sign = head >> 63; + bias = 1023; + fInf = 0x7FF0000000000000ull; + mask = 0x7FFFFFFFFFFFFFFFull; + } + else if constexpr(sizeof(T) == 4) + { + head = x & 0xFF800000; + mantissa = x & 0x7FFFFF; + exponent = (head >> 23) & 0xFF; + sign = head >> 31; + bias = 127; + fInf = 0x7F800000; + mask = 0x7FFFFFFF; + } + else + { + head = x & 0xFC00; + mantissa = x & 0x3FF; + exponent = (head >> 10) & 0x1F; + sign = head >> 15; + bias = 15; + fInf = 0x7C00; + mask = 0x7FFF; + } + unsigned int signed_inf = 0; + unsigned int nan = 0; + if constexpr(is_fnuz) + { + signed_inf = clip ? ((sign << 7) + 0x7f) : 0x80; + nan = 0x80; + } + else + { + if constexpr(we == 4) + { // e4m3 + signed_inf = (sign << 7) + (clip ? 0x7e : 0x7f); + } + else + { // e5m2 + signed_inf = (sign << 7) + (clip ? 
0x7b : 0x7c); + } + nan = (sign << 7) + 0x7f; + } + // Max values + unsigned long long ifmax = 0; + if constexpr(sizeof(T) == 8) + { + if constexpr(we == 5) + { // 57344 + ifmax = 0x40EC000000000000ull; + } + else + { + if constexpr(is_fnuz) + { // 240 + ifmax = 0x406E000000000000ull; + } + else + { // 448 + ifmax = 0x407C000000000000ull; + } + } + } + else if(sizeof(T) == 4) + { + if constexpr(we == 5) + { + ifmax = 0x47600000; + } + else + { + if constexpr(is_fnuz) + { + ifmax = 0x43700000; + } + else + { + ifmax = 0x43E00000; + } + } + } + else + { + if constexpr(we == 5) + { + ifmax = 0x7B00; + } + else + { + if constexpr(is_fnuz) + { + ifmax = 0x5B80; + } + else + { + ifmax = 0x5F00; + } + } + } + // Deal with inf and NaNs + if((x & fInf) == fInf) + { + if constexpr(is_fnuz) + return signed_inf; + + return mantissa != 0 ? nan : signed_inf; + } + + if((x & mask) > ifmax) + { + return signed_inf; + } + + if(x == 0) + { + return 0; + } + + // First need to check if it is normal or denorm as there is a difference of + // implicit 1 Then need to adjust the exponent to align with the F8 exponent, + // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng + // to mantissa and truncate. And for RNE, no need to add rng. Then probably + // need to check whether there is carry and adjust exponent and mantissa again + + // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent + // bits + const int f8_bias = (1 << (we - 1)) - 1 + (is_fnuz ? 1 : 0); + const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal + // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias) + // f8_exponent is the converted f8 exponent with bias encoding + // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent, + // the difference needs to be adjusted and mantissa shifted + int act_exponent, f8_exponent, exponent_diff; + + if(exponent == 0) + { // fp32/fp16 is in denormal. 
+ /* fp32 denormal is below 2^-127 so it is usually not a concern here, we + are mostly concerned with fp16 here. In this case, f8 is usually in denormal. But there + could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has + exponent bias 16. It means that there are some numbers in fp16 denormal but they + are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers + where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 + (NANOO) normal. In this case, the fp16 mantissa should be shifted left by 1 */ + act_exponent = exponent - bias + 1; + exponent_diff = f8_denormal_act_exponent - + act_exponent; // actual exponent is exponent-bias+1 as it is denormal + } + else + { // fp32/fp16 is normal with implicit 1 + act_exponent = exponent - bias; + if(act_exponent <= f8_denormal_act_exponent) + { + /* This is the case where fp32/fp16 is normal but it is in f8 denormal + range. For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16 + actual exponent is -7, it is actually larger due to the implicit 1, + Therefore it needs to be adjusted to -6 and the mantissa shifted right by 1. + So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */ + exponent_diff = f8_denormal_act_exponent - act_exponent; + } + else + { // both fp32/fp16 and f8 are in normal range + exponent_diff = 0; // exponent_diff=0 does not mean there is no difference + // for this case, act_exponent could be larger. Just + // that it does not need to shift the mantissa + } + mantissa += (1ull << mfmt); // Add the implicit 1 into mantissa + } + + bool midpoint = (mantissa & ((1ull << (mfmt - wm + exponent_diff)) - 1)) == + (1ull << (mfmt - wm + exponent_diff - 1)); + /* This part is a bit tricky. The judgment of whether it is a tie needs to be + done before we shift right as shift right could rip off some residual part and + make something not midpoint look like midpoint. 
For example, the fp16 number + 0x1002 (0 00100 0000000010), it is larger than midpoint, but after shift right + by 4 bits, it would look like midpoint. + */ + + if(exponent_diff > 0) + mantissa >>= exponent_diff; + else if(exponent_diff == -1) + mantissa <<= -exponent_diff; + bool implicit_one = mantissa & (1ull << mfmt); + // if there is no implicit 1, it means the f8 is denormal and need to adjust + // to denorm exponent + f8_exponent = + (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1); + + // Now we have the exponent and mantissa adjusted + unsigned long long drop_mask = (1ull << (mfmt - wm)) - 1; + bool odd = + mantissa & (1ull << (mfmt - wm)); // if the least significant bit that is not truncated is 1 + mantissa += + (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1ull) : mantissa)) & drop_mask; + + // Now we deal with overflow + if(f8_exponent == 0) + { + if((1ull << mfmt) & mantissa) + { + f8_exponent = 1; // denormal overflow to become normal, promote exponent + } + } + else + { + if((1ull << (mfmt + 1)) & mantissa) + { + mantissa >>= 1; + f8_exponent++; + } + } + + mantissa >>= (mfmt - wm); + + // above range: quantize to maximum possible float of the same sign + const int max_exp = (1 << we) - 1; + if(f8_exponent > max_exp) + { + if constexpr(clip) + { + mantissa = (1 << wm) - 1; + f8_exponent = max_exp; + } + else + { + return signed_inf; + } + } + + if(f8_exponent == 0 && mantissa == 0) + return is_fnuz ? 
0 : (sign << 7); + mantissa &= (1 << wm) - 1; + return (sign << 7) | (f8_exponent << wm) | mantissa; +} + +/** + * \brief convert float to @p fp8_storage_t + * + * \tparam interp interpretation of fp8 + * \tparam sat saturation of fp8 + * \param f float number + * \return fp8_storage_t + */ +template +#if CK_FP8_CVT_FAST_PATH +__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f) +{ + __is_interpret_supported(interp); + uint32_t rng = 0; + if constexpr(stochastic_rounding) + { + constexpr int seed = 1254739; + rng = prand_generator(reinterpret_cast(&f), f); + } + return cast_to_f8_from_f32( + f, rng); +#else +#if CK_USE_OCP_FP8 +__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f) +{ +#else +__host__ static inline fp8_storage_t cvt_float_to_fp8(const float f) +{ +#endif + uint32_t rng = 0; + if constexpr(stochastic_rounding) + { + constexpr int seed = 1254739; + rng = prand_generator(reinterpret_cast(&f), f); + } + + if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_FNUZ) + { + return cast_to_f8(f, rng); + } + else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_FNUZ) + { + return cast_to_f8(f, rng); + } + else if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_OCP) + { + return cast_to_f8(f, rng); + } + else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_OCP) + { + return cast_to_f8(f, rng); + } + else + { + __hip_assert(false && "FP8 type is not supported by current target device"); + return 0; + } +#endif // CK_FP8_CVT_FAST_PATH +} + +/** + * \brief convert _Float16 to @p fp8_storage_t + * + * \tparam sat saturation of fp8 + * \tparam interp interpretation of fp8 + * \tparam stochastic_rounding switch between RNE and SR + * \param x _Float16 value + * \return fp8_storage_t + */ +template +#if CK_FP8_CVT_FAST_PATH || CK_USE_OCP_FP8 +__host__ __device__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x) +#else +__host__ static inline fp8_storage_t cvt_half_t_to_fp8(const 
_Float16 x) +#endif +{ + return cvt_float_to_fp8(static_cast(x)); +} + +} // namespace fp8_impl + +// Declare a template function for fp8 conversion using RNE +template +__host__ __device__ constexpr Y f8_convert_rne(X x); + +// convert fp32 to fp8 with rounding to nearest even +template <> +inline __host__ __device__ f8_ocp_t f8_convert_rne(float x) +{ + return f8_ocp_t{ + fp8_impl::cvt_float_to_fp8(x)}; +} + +// convert fp32 to bf8 with rounding to nearest even +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_rne(float x) +{ + return bf8_ocp_t{ + fp8_impl::cvt_float_to_fp8(x)}; +} + +// convert _Float16 to fp8 with rounding to nearest even +template <> +inline __host__ __device__ f8_ocp_t f8_convert_rne(_Float16 x) +{ + return f8_ocp_t{ + fp8_impl::cvt_half_t_to_fp8(x)}; +} + +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_rne(_Float16 x) +{ + return bf8_ocp_t{ + fp8_impl::cvt_half_t_to_fp8( + x)}; +} + +// Declare a template function for fp8 conversion using SR +template +__host__ __device__ constexpr Y f8_convert_sr(X x); + +// convert fp32 to fp8 with stochastic rounding +template <> +inline __host__ __device__ f8_ocp_t f8_convert_sr(float x) +{ + return f8_ocp_t{ + fp8_impl::cvt_float_to_fp8( + x)}; +} + +// convert fp32 to bf8 with stochastic rounding +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_sr(float x) +{ + return bf8_ocp_t{fp8_impl::cvt_float_to_fp8(x)}; +} + +// convert _Float16 to fp8 with stochastic rounding +template <> +inline __host__ __device__ f8_ocp_t f8_convert_sr(_Float16 x) +{ + return f8_ocp_t{fp8_impl::cvt_half_t_to_fp8(x)}; +} + +// convert _Float16 to bf8 with stochastic rounding +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_sr(_Float16 x) +{ + return bf8_ocp_t{fp8_impl::cvt_half_t_to_fp8(x)}; +} + +#if CK_USE_OCP_FP8 +using f8_t = f8_ocp_t; +using bf8_t = bf8_ocp_t; +#define CK_FP8_TYPE_FNUZ 0 +#define CK_FP8_TYPE_OCP 1 +#else +using f8_t = f8_fnuz_t; +using bf8_t = 
bf8_fnuz_t; +#define CK_FP8_TYPE_FNUZ 1 +#define CK_FP8_TYPE_OCP 0 +#endif + +} // namespace ck diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index a955279bc8..5a7030cca7 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -4,7 +4,7 @@ #pragma once namespace ck { -// Define the common macro for gfx94x models +// Define the common macro for MI300 models #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) #define __gfx94__ #endif diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 39f532e0e9..a7dc071bc2 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -3,6 +3,7 @@ #pragma once +#include "ck/utility/amd_ck_fp8.hpp" #include "ck/utility/statically_indexed_array.hpp" namespace ck { @@ -10,8 +11,6 @@ namespace ck { using bhalf_t = ushort; using half_t = _Float16; using int4_t = _BitInt(4); -using f8_t = _BitInt(8); -using bf8_t = unsigned _BitInt(8); inline constexpr auto next_pow2(uint32_t x) { @@ -19,14 +18,15 @@ inline constexpr auto next_pow2(uint32_t x) return x > 1u ? 
(1u << (32u - __builtin_clz(x - 1u))) : x; } -// native types: double, float, _Float16, ushort, int32_t, int8_t, uint8_t, f8_t, bf8_t, bool +// native types: double, float, _Float16, ushort, int32_t, int8_t, uint8_t, f8_fnuz_t, bf8_fnuz_t, +// native types: bool template inline constexpr bool is_native_type() { return is_same::value || is_same::value || is_same::value || is_same::value || is_same::value || is_same::value || - is_same::value || is_same::value || is_same::value || - is_same::value; + is_same::value || is_same::value || + is_same::value || is_same::value; } // vector_type @@ -166,16 +166,30 @@ struct scalar_type #endif template <> -struct scalar_type +struct scalar_type { - using type = f8_t; + using type = f8_fnuz_t; static constexpr index_t vector_size = 1; }; template <> -struct scalar_type +struct scalar_type { - using type = bf8_t; + using type = bf8_fnuz_t; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = f8_ocp_t::data_type; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = bf8_ocp_t::data_type; static constexpr index_t vector_size = 1; }; @@ -1010,60 +1024,203 @@ struct vector_type()>> } }; -template -struct non_native_vector_base +template +struct non_native_vector_base; + +template +struct nnvb_data_t_selector { - using type = non_native_vector_base; + using type = unsigned _BitInt(8 * sizeof(T)); +}; - __host__ __device__ non_native_vector_base() = default; - __host__ __device__ non_native_vector_base(const type&) = default; - __host__ __device__ non_native_vector_base(type&&) = default; - __host__ __device__ ~non_native_vector_base() = default; +template <> +struct nnvb_data_t_selector +{ + using type = f8_ocp_t::data_type; +}; +template <> +struct nnvb_data_t_selector +{ + using type = bf8_ocp_t::data_type; +}; - T d[N]; +template +struct non_native_vector_base< + T, + N, + std::enable_if_t> +{ + using data_t = typename 
nnvb_data_t_selector::type; // select data_t based on the size of T + static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch"); + using data_v = data_t __attribute__((ext_vector_type(N))); + using type = non_native_vector_base; + + union alignas(next_pow2(N * sizeof(T))) + { + data_v dN; // storage vector; + StaticallyIndexedArray dxN; + StaticallyIndexedArray dTxN; + StaticallyIndexedArray dNx1; + } data_; + + __host__ __device__ constexpr non_native_vector_base(data_t a) : data_{data_v(a)} {} + __host__ __device__ constexpr non_native_vector_base(T f) + : non_native_vector_base(bit_cast(f)) + { + } + __host__ __device__ constexpr non_native_vector_base() : non_native_vector_base(T{}){}; + __host__ __device__ constexpr non_native_vector_base(data_v v) : data_{v} {} + + __host__ __device__ constexpr operator data_v() const { return data_.dN; } + __host__ __device__ constexpr operator data_t() const + { + if constexpr(N == 1) + { + return data_.dxN[Number<0>{}]; + } + else + { + return data_.dxN; // XXX this should cause an error + } + } + __host__ __device__ constexpr operator T() const + { + if constexpr(N == 1) + { + return data_.dTxN[Number<0>{}]; + } + else + { + return data_.dTxN; // XXX this should cause an error + } + } + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same_v || is_same_v || is_same_v, + "Something went wrong, please check src and dst types."); + + if constexpr(is_same_v) + { + return data_.dxN; + } + else if constexpr(is_same_v) + { + return data_.dTxN; + } + else if constexpr(is_same_v) + { + return data_.dNx1; + } + else + { + return err; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same_v || is_same_v || is_same_v, + "Something went wrong, please check src and dst types."); + + if constexpr(is_same_v) + { + return data_.dxN; + } + else if constexpr(is_same_v) + { + return data_.dTxN; + } + else if 
constexpr(is_same_v) + { + return data_.dNx1; + } + else + { + return err; + } + } +}; + +template +struct scalar_type>; + +template +struct scalar_type> +{ + using type = typename non_native_vector_base::data_t; + + static constexpr index_t vector_size = N; +}; + +template +struct scalar_type> +{ + using type = typename non_native_vector_base::data_t; + + static constexpr index_t vector_size = N; }; // non-native vector_type implementation template struct vector_type()>> { - using d1_t = T; - using type = d1_t; + using d1_t = T; + using d1_nnv_t = non_native_vector_base; + using type = d1_nnv_t; union alignas(next_pow2(1 * sizeof(T))) { d1_t d1_; StaticallyIndexedArray d1x1_; + d1_nnv_t d1_nnv_; } data_; - __host__ __device__ constexpr vector_type() : data_{type{}} {} + __host__ __device__ constexpr vector_type() : data_{d1_t{}} {} __host__ __device__ constexpr vector_type(type v) : data_{v} {} template __host__ __device__ constexpr const auto& AsType() const { - static_assert(is_same::value, + static_assert(is_same::value || is_same::value, "Something went wrong, please check src and dst types."); - return data_.d1x1_; + if constexpr(is_same::value || is_same::value) + { + return data_.d1x1_; + } + else + { + return err; + } } template __host__ __device__ constexpr auto& AsType() { - static_assert(is_same::value, + static_assert(is_same::value || is_same::value, "Something went wrong, please check src and dst types."); - return data_.d1x1_; + if constexpr(is_same::value || is_same::value) + { + return data_.d1x1_; + } + else + { + return err; + } } }; template struct vector_type()>> { - using d1_t = T; - using d2_t = non_native_vector_base; + using d1_t = T; + using d1_nnv_t = non_native_vector_base; + using d2_t = non_native_vector_base; using type = d2_t; @@ -1081,10 +1238,11 @@ struct vector_type()>> template __host__ __device__ constexpr const auto& AsType() const { - static_assert(is_same::value || is_same::value, + static_assert(is_same::value || 
is_same::value || + is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x2_; } @@ -1101,10 +1259,11 @@ struct vector_type()>> template __host__ __device__ constexpr auto& AsType() { - static_assert(is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x2_; } @@ -1122,9 +1281,10 @@ struct vector_type()>> template struct vector_type()>> { - using d1_t = T; - using d2_t = non_native_vector_base; - using d4_t = non_native_vector_base; + using d1_t = T; + using d1_nnv_t = non_native_vector_base; + using d2_t = non_native_vector_base; + using d4_t = non_native_vector_base; using type = d4_t; @@ -1143,10 +1303,11 @@ struct vector_type()>> template __host__ __device__ constexpr const auto& AsType() const { - static_assert(is_same::value || is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x4_; } @@ -1167,10 +1328,11 @@ struct vector_type()>> template __host__ __device__ constexpr auto& AsType() { - static_assert(is_same::value || is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x4_; } @@ -1192,10 +1354,11 @@ struct vector_type()>> template struct vector_type()>> { - using d1_t = T; - using d2_t = non_native_vector_base; - using d4_t = non_native_vector_base; - using d8_t = non_native_vector_base; + using d1_t 
= T; + using d1_nnv_t = non_native_vector_base; + using d2_t = non_native_vector_base; + using d4_t = non_native_vector_base; + using d8_t = non_native_vector_base; using type = d8_t; @@ -1215,11 +1378,12 @@ struct vector_type()>> template __host__ __device__ constexpr const auto& AsType() const { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x8_; } @@ -1244,11 +1408,12 @@ struct vector_type()>> template __host__ __device__ constexpr auto& AsType() { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x8_; } @@ -1274,11 +1439,12 @@ struct vector_type()>> template struct vector_type()>> { - using d1_t = T; - using d2_t = non_native_vector_base; - using d4_t = non_native_vector_base; - using d8_t = non_native_vector_base; - using d16_t = non_native_vector_base; + using d1_t = T; + using d1_nnv_t = non_native_vector_base; + using d2_t = non_native_vector_base; + using d4_t = non_native_vector_base; + using d8_t = non_native_vector_base; + using d16_t = non_native_vector_base; using type = d16_t; @@ -1299,12 +1465,12 @@ struct vector_type()>> template __host__ __device__ constexpr const auto& AsType() const { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, "Something went wrong, please check 
src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x16_; } @@ -1333,12 +1499,12 @@ struct vector_type()>> template __host__ __device__ constexpr auto& AsType() { - static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, "Something went wrong, please check src and dst types."); - if constexpr(is_same::value) + if constexpr(is_same::value || is_same::value) { return data_.d1x16_; } @@ -1632,20 +1798,70 @@ using int8x32_t = typename vector_type::type; using int8x64_t = typename vector_type::type; // f8 -using f8x2_t = typename vector_type::type; -using f8x4_t = typename vector_type::type; -using f8x8_t = typename vector_type::type; -using f8x16_t = typename vector_type::type; -using f8x32_t = typename vector_type::type; -using f8x64_t = typename vector_type::type; +using f8x2_fnuz_t = typename vector_type::type; +using f8x4_fnuz_t = typename vector_type::type; +using f8x8_fnuz_t = typename vector_type::type; +using f8x16_fnuz_t = typename vector_type::type; +using f8x32_fnuz_t = typename vector_type::type; +using f8x64_fnuz_t = typename vector_type::type; // bf8 -using bf8x2_t = typename vector_type::type; -using bf8x4_t = typename vector_type::type; -using bf8x8_t = typename vector_type::type; -using bf8x16_t = typename vector_type::type; -using bf8x32_t = typename vector_type::type; -using bf8x64_t = typename vector_type::type; +using bf8x2_fnuz_t = typename vector_type::type; +using bf8x4_fnuz_t = typename vector_type::type; +using bf8x8_fnuz_t = typename vector_type::type; +using bf8x16_fnuz_t = typename vector_type::type; +using bf8x32_fnuz_t = typename vector_type::type; +using bf8x64_fnuz_t = typename vector_type::type; + +// f8 +using f8x2_ocp_t = typename vector_type::type; +using f8x4_ocp_t = typename 
vector_type::type; +using f8x8_ocp_t = typename vector_type::type; +using f8x16_ocp_t = typename vector_type::type; +using f8x32_ocp_t = typename vector_type::type; +using f8x64_ocp_t = typename vector_type::type; + +// bf8 +using bf8x2_ocp_t = typename vector_type::type; +using bf8x4_ocp_t = typename vector_type::type; +using bf8x8_ocp_t = typename vector_type::type; +using bf8x16_ocp_t = typename vector_type::type; +using bf8x32_ocp_t = typename vector_type::type; +using bf8x64_ocp_t = typename vector_type::type; + +#if CK_FP8_TYPE_OCP +// f8 +using f8x2_t = f8x2_ocp_t; +using f8x4_t = f8x4_ocp_t; +using f8x8_t = f8x8_ocp_t; +using f8x16_t = f8x16_ocp_t; +using f8x32_t = f8x32_ocp_t; +using f8x64_t = f8x64_ocp_t; + +// bf8 +using bf8x2_t = bf8x2_ocp_t; +using bf8x4_t = bf8x4_ocp_t; +using bf8x8_t = bf8x8_ocp_t; +using bf8x16_t = bf8x16_ocp_t; +using bf8x32_t = bf8x32_ocp_t; +using bf8x64_t = bf8x64_ocp_t; +#elif CK_FP8_TYPE_FNUZ +// f8 +using f8x2_t = f8x2_fnuz_t; +using f8x4_t = f8x4_fnuz_t; +using f8x8_t = f8x8_fnuz_t; +using f8x16_t = f8x16_fnuz_t; +using f8x32_t = f8x32_fnuz_t; +using f8x64_t = f8x64_fnuz_t; + +// bf8 +using bf8x2_t = bf8x2_fnuz_t; +using bf8x4_t = bf8x4_fnuz_t; +using bf8x8_t = bf8x8_fnuz_t; +using bf8x16_t = bf8x16_fnuz_t; +using bf8x32_t = bf8x32_fnuz_t; +using bf8x64_t = bf8x64_fnuz_t; +#endif // u8 using uint8x2_t = typename vector_type::type; @@ -1702,7 +1918,7 @@ struct NumericLimits #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 template <> -struct NumericLimits +struct NumericLimits { // negative zero nan mode with exp bias = 8 static constexpr uint8_t binary_min = 0x08; // 0b00001000 @@ -1715,17 +1931,17 @@ struct NumericLimits // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111 // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!=0 - __host__ __device__ static constexpr f8_t Min() { return f8_t(binary_min); } + __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); 
} - __host__ __device__ static constexpr f8_t Max() { return f8_t(binary_max); } + __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); } - __host__ __device__ static constexpr f8_t Lowest() { return f8_t(binary_lowest); } + __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); } - __host__ __device__ static constexpr f8_t QuietNaN() { return f8_t(binary_qnan); } + __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); } }; template <> -struct NumericLimits +struct NumericLimits { // negative zero nan mode with exp bias = 16 static constexpr uint8_t binary_min = 0x04; // 0b00000100 @@ -1738,13 +1954,59 @@ struct NumericLimits // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!= - __host__ __device__ static constexpr bf8_t Min() { return bf8_t(binary_min); } + __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); } - __host__ __device__ static constexpr bf8_t Max() { return bf8_t(binary_max); } + __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); } - __host__ __device__ static constexpr bf8_t Lowest() { return bf8_t(binary_lowest); } + __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); } - __host__ __device__ static constexpr bf8_t QuietNaN() { return bf8_t(binary_qnan); } + __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); } +}; + +template <> +struct NumericLimits +{ + static constexpr uint8_t binary_min = 0x08; // 0b00001000 = 2^-6 + static constexpr uint8_t binary_max = 0x7E; // 0b01111110 = 448 + static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448 + static constexpr uint8_t binary_qnan = 0x7F; // 0b01111111 + + __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast(binary_min); } + + 
__host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr f8_ocp_t Lowest() + { + return bit_cast(binary_lowest); + } + + __host__ __device__ static constexpr f8_ocp_t QuietNaN() + { + return bit_cast(binary_qnan); + } +}; + +template <> +struct NumericLimits +{ + static constexpr uint8_t binary_min = 0x04; // 0b00000100 = 2^-14 + static constexpr uint8_t binary_max = 0x7B; // 0b01111011 = 57344 + static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344 + static constexpr uint8_t binary_qnan = 0x7D; // 0b01111101 + + __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr bf8_ocp_t Lowest() + { + return bit_cast(binary_lowest); + } + + __host__ __device__ static constexpr bf8_ocp_t QuietNaN() + { + return bit_cast(binary_qnan); + } }; template @@ -1787,7 +2049,7 @@ struct NumericUtils }; template <> -struct NumericUtils +struct NumericUtils { static constexpr int exp = 4; static constexpr int mant = 3; @@ -1796,13 +2058,28 @@ struct NumericUtils }; template <> -struct NumericUtils +struct NumericUtils { static constexpr int exp = 5; static constexpr int mant = 2; static constexpr int bias = 16; // negative zero nan mode // static constexpr int bias = 15; // ieee mode }; +template <> +struct NumericUtils +{ + static constexpr int exp = 4; + static constexpr int mant = 3; + static constexpr int bias = 7; +}; + +template <> +struct NumericUtils +{ + static constexpr int exp = 5; + static constexpr int mant = 2; + static constexpr int bias = 15; +}; template <> struct NumericUtils diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp index b374c4ad55..a6c3540d85 100644 --- a/include/ck/utility/math_v2.hpp +++ b/include/ck/utility/math_v2.hpp @@ -80,7 +80,7 @@ static inline __host__ bool isnan(half_t x) return 
(xx & 0x7FFF) > 0x7C00; }; -static inline __host__ bool isnan(f8_t x) { return (x & 0x80); }; +static inline __host__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); }; #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 static inline __host__ bool isnan(int4_t x) @@ -531,7 +531,7 @@ static inline __device__ bool isnan(half_t x) return (xx & 0x7FFF) > 0x7C00; }; -static inline __device__ bool isnan(f8_t x) { return (x & 0x80); }; +static inline __device__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); }; static inline __device__ half_t sqrt(half_t x) { diff --git a/include/ck/utility/random_gen.hpp b/include/ck/utility/random_gen.hpp index b7edf26507..4ea52f7eb0 100644 --- a/include/ck/utility/random_gen.hpp +++ b/include/ck/utility/random_gen.hpp @@ -1,8 +1,10 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#include "ck/ck.hpp" + namespace ck { // Pseudo random number generator @@ -23,7 +25,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = } // version for fp16 -template {}, bool> = false> +template {}, bool> = false> __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t) { uint16_t x = *(reinterpret_cast(&val)); @@ -38,9 +40,10 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = } // return 0 if data is not fp16 or fp32 -template {} || std::is_same{}), bool> = false> +template < + typename T, + uint32_t seed_t, + std::enable_if_t{} || std::is_same<_Float16, T>{}), bool> = false> __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t) { std::ignore = id; diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp index 87fa9aa38a..f372756e68 100644 --- a/include/ck/utility/type_convert.hpp +++ b/include/ck/utility/type_convert.hpp @@ -9,7 +9,7 @@ #include 
"ck/utility/array.hpp" namespace ck { -// Define the common macro for gfx94x models +// Define the common macro for MI300 models #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) #define __gfx94__ #endif @@ -100,6 +100,18 @@ inline __host__ __device__ constexpr bhalf_t type_convert(int8_ return type_convert(x_fp32); } +template <> +inline __host__ __device__ constexpr f8_ocp_t type_convert(int x) +{ + return f8_ocp_t{type_convert(x)}; +} + +template <> +inline __host__ __device__ constexpr bf8_ocp_t type_convert(int x) +{ + return bf8_ocp_t{type_convert(x)}; +} + // Convert X to Y template __host__ __device__ constexpr Y type_convert_sp(X x) @@ -163,7 +175,7 @@ __host__ __device__ constexpr Y f8_convert_sr(X x); // convert fp32 to fp8 with stochastic rounding template <> -inline __host__ __device__ f8_t f8_convert_sr(float x) +inline __host__ __device__ f8_fnuz_t f8_convert_sr(float x) { constexpr int seed = 1254739; uint32_t rng = prand_generator(reinterpret_cast(&x), x); @@ -189,33 +201,35 @@ inline __host__ __device__ f8_t f8_convert_sr(float x) constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic; return utils:: - cast_to_f8(x, - rng); + cast_to_f8( + x, rng); #endif } // convert fp16 to fp8 with stochastic rounding template <> -inline __host__ __device__ f8_t f8_convert_sr(half_t x) +inline __host__ __device__ f8_fnuz_t f8_convert_sr(half_t x) { #if defined(__gfx94__) // convert to float and use native converion - return f8_convert_sr(type_convert(x)); + return f8_convert_sr(type_convert(x)); #else constexpr bool negative_zero_nan = true; constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic; constexpr int seed = 1254739; uint32_t rng = prand_generator(reinterpret_cast(&x), x); - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); #endif } // convert fp32 to bf8 with stochastic rounding template <> -inline __host__ __device__ bf8_t 
f8_convert_sr(float x) +inline __host__ __device__ bf8_fnuz_t f8_convert_sr(float x) { constexpr int seed = 1254739; uint32_t rng = prand_generator(reinterpret_cast(&x), x); @@ -240,28 +254,32 @@ inline __host__ __device__ bf8_t f8_convert_sr(float x) constexpr bool negative_zero_nan = true; constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic; - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); #endif } // convert fp16 to bf8 with stochastic rounding template <> -inline __host__ __device__ bf8_t f8_convert_sr(half_t x) +inline __host__ __device__ bf8_fnuz_t f8_convert_sr(half_t x) { #if defined(__gfx94__) // convert to float and use native converion - return f8_convert_sr(type_convert(x)); + return f8_convert_sr(type_convert(x)); #else constexpr bool negative_zero_nan = true; constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic; constexpr int seed = 1254739; uint32_t rng = prand_generator(reinterpret_cast(&x), x); - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); #endif } @@ -271,7 +289,7 @@ __host__ __device__ constexpr Y f8_convert_rne(X x); // convert fp32 to fp8 with rounding to nearest even template <> -inline __host__ __device__ f8_t f8_convert_rne(float x) +inline __host__ __device__ f8_fnuz_t f8_convert_rne(float x) { #if defined(__gfx94__) union @@ -296,32 +314,34 @@ inline __host__ __device__ f8_t f8_convert_rne(float x) constexpr f8_rounding_mode rm = f8_rounding_mode::standard; constexpr uint32_t rng = 0; return utils:: - cast_to_f8(x, - rng); + cast_to_f8( + x, rng); #endif } // convert fp16 to fp8 with rounding to nearest even template <> -inline __host__ __device__ f8_t f8_convert_rne(half_t x) +inline __host__ __device__ f8_fnuz_t f8_convert_rne(half_t x) { #if defined(__gfx94__) // convert to float and use native converion - return f8_convert_rne(type_convert(x)); + return f8_convert_rne(type_convert(x)); #else constexpr 
bool negative_zero_nan = true; constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::standard; constexpr uint32_t rng = 0; - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); #endif } // convert fp32 to bf8 with rounding to nearest even template <> -inline __host__ __device__ bf8_t f8_convert_rne(float x) +inline __host__ __device__ bf8_fnuz_t f8_convert_rne(float x) { #if defined(__gfx94__) union @@ -345,44 +365,59 @@ inline __host__ __device__ bf8_t f8_convert_rne(float x) constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::standard; constexpr uint32_t rng = 0; - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); #endif } // convert fp16 to bf8 with rounding to nearest even template <> -inline __host__ __device__ bf8_t f8_convert_rne(half_t x) +inline __host__ __device__ bf8_fnuz_t f8_convert_rne(half_t x) { #if defined(__gfx94__) // convert to float and use native converion - return f8_convert_rne(type_convert(x)); + return f8_convert_rne(type_convert(x)); #else constexpr bool negative_zero_nan = true; constexpr bool clip = true; constexpr f8_rounding_mode rm = f8_rounding_mode::standard; constexpr uint32_t rng = 0; - return utils:: - cast_to_f8( - x, rng); + return utils::cast_to_f8(x, rng); #endif } // convert fp32 to fp8 template <> -inline __host__ __device__ f8_t type_convert(float x) +inline __host__ __device__ f8_fnuz_t type_convert(float x) { #if CK_USE_SR_F8_CONVERSION - return f8_convert_sr(x); + return f8_convert_sr(x); #else - return f8_convert_rne(x); + return f8_convert_rne(x); +#endif +} + +// convert fp32 to fp8 +template <> +inline __host__ __device__ f8_ocp_t type_convert(float x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return f8_convert_rne(x); #endif } // convert fp8 to fp32 template <> -inline __host__ __device__ float type_convert(f8_t x) +inline __host__ __device__ float type_convert(f8_fnuz_t x) { #if 
defined(__gfx94__) float fval; @@ -392,30 +427,44 @@ inline __host__ __device__ float type_convert(f8_t x) return fval; #else constexpr bool negative_zero_nan = true; - return utils::cast_from_f8(x); + return utils::cast_from_f8(x); #endif } template <> -inline __host__ __device__ float2_t type_convert(f8x2_t x) +inline __host__ __device__ float2_t type_convert(f8x2_fnuz_t x) { #if defined(__gfx94__) const auto i16val = bit_cast(x); return __builtin_amdgcn_cvt_pk_f32_fp8(i16val, 0); #else constexpr bool negative_zero_nan = true; - const auto f8x2_v = vector_type(x); + const auto f8x2_v = vector_type(x); vector_type f32x2_v; f32x2_v.template AsType()(Number<0>{}) = - utils::cast_from_f8( - f8x2_v.template AsType()[Number<0>{}]); + utils::cast_from_f8( + f8x2_v.template AsType()[Number<0>{}]); f32x2_v.template AsType()(Number<1>{}) = - utils::cast_from_f8( - f8x2_v.template AsType()[Number<1>{}]); + utils::cast_from_f8( + f8x2_v.template AsType()[Number<1>{}]); return f32x2_v.template AsType()[Number<0>{}]; #endif } +template <> +inline __host__ __device__ float2_t type_convert(f8x2_ocp_t x) +{ +#if CK_OCP_FP8_CVT_FAST_PATH + return fp8_impl::cast_to_f32x2_from_f8x2( + x.AsType()[Number<0>{}]); +#else + return float2_t{fp8_impl::cast_from_f8( + x.AsType()[Number<0>{}]), + fp8_impl::cast_from_f8( + x.AsType()[Number<1>{}])}; +#endif +} + template <> inline __host__ __device__ half2_t type_convert(float2_t x) { @@ -428,42 +477,64 @@ inline __host__ __device__ half2_t type_convert(float2_t x) // convert fp16 to fp8 template <> -inline __host__ __device__ f8_t type_convert(half_t x) +inline __host__ __device__ f8_fnuz_t type_convert(half_t x) { #if CK_USE_SR_F8_CONVERSION - return f8_convert_sr(x); + return f8_convert_sr(x); #else - return f8_convert_rne(x); + return f8_convert_rne(x); +#endif +} + +// convert fp16 to fp8 +template <> +inline __host__ __device__ f8_ocp_t type_convert(half_t x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return 
f8_convert_rne(x); #endif } // convert fp8 to fp16 template <> -inline __host__ __device__ half_t type_convert(f8_t x) +inline __host__ __device__ half_t type_convert(f8_fnuz_t x) { #if defined(__gfx94__) // use native conversion to float and convert to fp16 return type_convert(type_convert(x)); #else constexpr bool negative_zero_nan = true; - return utils::cast_from_f8(x); + return utils::cast_from_f8(x); #endif } // convert fp32 to bf8 template <> -inline __host__ __device__ bf8_t type_convert(float x) +inline __host__ __device__ bf8_fnuz_t type_convert(float x) { #if CK_USE_SR_F8_CONVERSION - return f8_convert_sr(x); + return f8_convert_sr(x); #else - return f8_convert_rne(x); + return f8_convert_rne(x); +#endif +} + +// convert fp32 to bf8 +template <> +inline __host__ __device__ bf8_ocp_t type_convert(float x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return f8_convert_rne(x); #endif } // convert bf8 to fp32 template <> -inline __host__ __device__ float type_convert(bf8_t x) +inline __host__ __device__ float type_convert(bf8_fnuz_t x) { #if defined(__gfx94__) float fval; @@ -473,31 +544,42 @@ inline __host__ __device__ float type_convert(bf8_t x) return fval; #else constexpr bool negative_zero_nan = true; - return utils::cast_from_f8(x); + return utils::cast_from_f8(x); #endif } // convert fp16 to bf8 template <> -inline __host__ __device__ bf8_t type_convert(half_t x) +inline __host__ __device__ bf8_fnuz_t type_convert(half_t x) { #if CK_USE_SR_F8_CONVERSION - return f8_convert_sr(x); + return f8_convert_sr(x); #else - return f8_convert_rne(x); + return f8_convert_rne(x); +#endif +} + +// convert fp16 to bf8 +template <> +inline __host__ __device__ bf8_ocp_t type_convert(half_t x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return f8_convert_rne(x); #endif } // convert bf8 to fp16 template <> -inline __host__ __device__ half_t type_convert(bf8_t x) +inline __host__ __device__ half_t type_convert(bf8_fnuz_t x) 
{ #if defined(__gfx94__) // use native conversion to float and convert to fp16 return type_convert(type_convert(x)); #else constexpr bool negative_zero_nan = true; - return utils::cast_from_f8(x); + return utils::cast_from_f8(x); #endif } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index e1edc4fae0..1ae11fe9db 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -62,9 +62,9 @@ struct ReferenceGemm : public device::BaseOperator auto f_mk_kn_mn = [&](auto m, auto n) { const int K = arg.a_m_k_.mDesc.GetLengths()[1]; - AccDataType v_acc = 0; - ComputeTypeA v_a = 0; - ComputeTypeB v_b = 0; + AccDataType v_acc{0}; + ComputeTypeA v_a{0}; + ComputeTypeB v_b{0}; for(int k = 0; k < K; ++k) { @@ -93,7 +93,7 @@ struct ReferenceGemm : public device::BaseOperator ck::type_convert(v_a) * ck::type_convert(v_b); } - CDataType v_c = 0; + CDataType v_c{0}; arg.c_element_op_(v_c, v_acc); diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 2c0b6c7b75..dd023e6b51 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -62,7 +62,7 @@ function(add_instance_library INSTANCE_NAME) endforeach() # Do not build mha instances if gfx94 or gfx90a targets are not on the target list foreach(source IN LISTS ARGN) - if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx90a" AND source MATCHES "mha") + if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES 
"gfx90a" AND source MATCHES "mha") message("removing mha instance ${source} ") list(REMOVE_ITEM ARGN "${source}") endif() @@ -346,7 +346,7 @@ if(CK_DEVICE_CONV_INSTANCES) endif() if(CK_DEVICE_MHA_INSTANCES) set(gpu_list ${INST_TARGETS}) - if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a") + if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a") add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES}) add_library(composablekernels::device_mha_operations ALIAS device_mha_operations) target_compile_features(device_mha_operations PUBLIC) diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp index af31cf8a86..e31433cc81 100644 --- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp @@ -15,7 +15,7 @@ void add_device_pool3d_fwd_ndhwc_f8_instances( instances) { add_device_operation_instances( - instances, device_pool3d_fwd_ndhwc_instances{}); + instances, device_pool3d_fwd_ndhwc_instances{}); } void add_device_pool3d_fwd_ndhwc_index_f8_instances( @@ -23,7 +23,7 @@ void add_device_pool3d_fwd_ndhwc_index_f8_instances( instances) { add_device_operation_instances( - instances, device_pool3d_fwd_ndhwc_instances{}); + instances, device_pool3d_fwd_ndhwc_instances{}); } } // namespace instance diff --git a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp index 5bee67c1ce..be69b67b5c 100644 --- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, 
Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -150,7 +150,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification, break; default: a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{1}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1{1}); } diff --git a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp index f3d2c55617..b585b7d56a 100644 --- a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -157,7 +157,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification, break; default: a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp index 15a21206c5..700ada73a1 100644 --- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -174,7 +174,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, break; default: a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp index f2fcb0b133..e3c462e21c 100644 --- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -140,7 +140,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification, break; default: a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{1}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp index 0419ccd8e7..1373dbc497 100644 --- a/profiler/include/profiler/profile_gemm_impl.hpp +++ b/profiler/include/profiler/profile_gemm_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -74,8 +74,8 @@ int profile_gemm_impl(int do_verification, switch(init_method) { case 0: - ck::utils::FillConstant{static_cast(1.f)}(a_m_k); - ck::utils::FillConstant{static_cast(1.f)}(b_k_n); + ck::utils::FillConstant{type_convert(1.f)}(a_m_k); + ck::utils::FillConstant{type_convert(1.f)}(b_k_n); break; case 1: ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); diff --git a/test/data_type/CMakeLists.txt b/test/data_type/CMakeLists.txt index a783be7bb0..a9d3dad7f3 100644 --- a/test/data_type/CMakeLists.txt +++ b/test/data_type/CMakeLists.txt @@ -9,13 +9,38 @@ if (USE_BITINT_EXTENSION_INT4) endif() endif() -add_gtest_executable(test_fp8 test_fp8.cpp) -if(result EQUAL 0) - target_link_libraries(test_fp8 PRIVATE utility) + + +add_custom_target(test_fp8) + +if (CK_USE_OCP_FP8) + add_gtest_executable(test_fp8_ocp test_fp8_ocp.cpp) + if(result EQUAL 0) + target_link_libraries(test_fp8_ocp PRIVATE utility) + endif() + + add_gtest_executable(test_bf8_ocp test_bf8_ocp.cpp) + if(result EQUAL 0) + target_link_libraries(test_bf8_ocp PRIVATE utility) + endif() + + add_dependencies(test_fp8 test_fp8_ocp) + add_dependencies(test_fp8 test_bf8_ocp) endif() -add_gtest_executable(test_bf8 test_bf8.cpp) -if(result EQUAL 0) - target_link_libraries(test_bf8 PRIVATE utility) + +if (CK_USE_FNUZ_FP8) + add_gtest_executable(test_fp8_fnuz test_fp8_fnuz.cpp) + if(result EQUAL 0) + target_link_libraries(test_fp8_fnuz PRIVATE utility) + endif() + + add_gtest_executable(test_bf8_fnuz test_bf8_fnuz.cpp) + if(result EQUAL 0) + target_link_libraries(test_bf8_fnuz PRIVATE utility) + endif() + + add_dependencies(test_fp8 test_fp8_fnuz) + add_dependencies(test_fp8 test_bf8_fnuz) endif() add_gtest_executable(test_custom_type test_custom_type.cpp) diff --git a/test/data_type/test_bf8.cpp b/test/data_type/test_bf8_fnuz.cpp similarity index 52% rename from test/data_type/test_bf8.cpp rename to test/data_type/test_bf8_fnuz.cpp index 6f50db68c7..4ff796a614 100644 --- 
a/test/data_type/test_bf8.cpp +++ b/test/data_type/test_bf8_fnuz.cpp @@ -5,158 +5,169 @@ #include "ck/utility/data_type.hpp" #include "ck/utility/type_convert.hpp" -using ck::bf8_t; +using ck::bf8_fnuz_t; using ck::f8_convert_rne; using ck::f8_convert_sr; using ck::half_t; using ck::type_convert; -TEST(BF8, NumericLimits) +TEST(BF8FNUZ, NumericLimits) { // constants given for negative zero nan mode - EXPECT_EQ(ck::NumericLimits::Min(), type_convert(0x04)); - EXPECT_EQ(ck::NumericLimits::Max(), type_convert(0x7F)); - EXPECT_EQ(ck::NumericLimits::Lowest(), type_convert(0xFF)); - EXPECT_EQ(ck::NumericLimits::QuietNaN(), type_convert(0x80)); + EXPECT_EQ(ck::NumericLimits::Min(), type_convert(0x04)); + EXPECT_EQ(ck::NumericLimits::Max(), type_convert(0x7F)); + EXPECT_EQ(ck::NumericLimits::Lowest(), type_convert(0xFF)); + EXPECT_EQ(ck::NumericLimits::QuietNaN(), type_convert(0x80)); } -TEST(BF8, ConvertFP32Nearest) +TEST(BF8FNUZ, ConvertFP32Nearest) { // fix the tolerance value float abs_tol = 1e-6; // convert 0 float to bf8 and back, check if holds - ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), abs_tol); + ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), abs_tol); // don't run the next test on gfx11 devices #ifndef CK_SKIP_FLAKY_F8_TEST // convert minimal float to bf8 and back, check if holds ASSERT_NEAR(std::numeric_limits::min(), - type_convert(f8_convert_rne(std::numeric_limits::min())), + type_convert(f8_convert_rne(std::numeric_limits::min())), abs_tol); #endif - // convert maximal bf8_t to float and check if equal to 57344.0 - ASSERT_NEAR(57344.0f, type_convert(f8_convert_rne(57344.0f)), abs_tol); + + const auto max_bf8_t_float = type_convert(ck::NumericLimits::Max()); + // convert maximal bf8_fnuz_t to float and check if equal to 57344.0 + ASSERT_NEAR( + max_bf8_t_float, type_convert(f8_convert_rne(max_bf8_t_float)), abs_tol); // convert maximal float to bf8 and back, check if clipped to 57344.0 - ASSERT_NEAR(57344.0f, - 
type_convert(f8_convert_rne(std::numeric_limits::max())), + ASSERT_NEAR(max_bf8_t_float, + type_convert(f8_convert_rne(std::numeric_limits::max())), abs_tol); - // convert inf float to bf8_t and check if it is qNan - ASSERT_NEAR(type_convert(0x80), - f8_convert_rne(std::numeric_limits::infinity()), + // convert inf float to bf8_fnuz_t and check if it is qNan + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_rne(std::numeric_limits::infinity()), abs_tol); // positive norm float value to bf8 and back, check if holds float pos_float = 0.0000762939f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); // negative norm float value to bf8 and back, check if holds float neg_float = -0.0000610351f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); // positive subnorm float value to bf8 and back, check if holds pos_float = 0.0000305175f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); // negative subnorm float value to bf8 and back, check if holds neg_float = -0.0000152587f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); } -TEST(BF8, ConvertFP32Stochastic) +TEST(BF8FNUZ, ConvertFP32Stochastic) { // fix the tolerance value float abs_tol = 1e-6; // convert 0 float to bf8 and back, check if holds - ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), abs_tol); + ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), abs_tol); // convert minimal float to bf8 and back, check if holds ASSERT_NEAR(std::numeric_limits::min(), - type_convert(f8_convert_sr(std::numeric_limits::min())), + type_convert(f8_convert_sr(std::numeric_limits::min())), abs_tol); - // convert maximal 
bf8_t to float and check if equal to 57344.0 - ASSERT_NEAR(57344.0f, type_convert(f8_convert_sr(57344.0f)), abs_tol); + + const auto max_bf8_t_float = type_convert(ck::NumericLimits::Max()); + // convert maximal bf8_fnuz_t to float and check if equal to 57344.0 + ASSERT_NEAR( + max_bf8_t_float, type_convert(f8_convert_sr(max_bf8_t_float)), abs_tol); // convert maximal float to bf8 and back, check if clipped to 57344.0 - ASSERT_NEAR(57344.0f, - type_convert(f8_convert_sr(std::numeric_limits::max())), + ASSERT_NEAR(max_bf8_t_float, + type_convert(f8_convert_sr(std::numeric_limits::max())), abs_tol); - // convert inf float to bf8_t and check if it is qNan - ASSERT_NEAR(type_convert(0x80), - f8_convert_sr(std::numeric_limits::infinity()), + // convert inf float to bf8_fnuz_t and check if it is qNan + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_sr(std::numeric_limits::infinity()), abs_tol); // positive norm float value to bf8 and back, check if holds float pos_float = 0.0000762939f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); // negative norm float value to bf8 and back, check if holds float neg_float = -0.0000610351f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); // positive subnorm float value to bf8 and back, check if holds pos_float = 0.0000305175f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); // negative subnorm float value to bf8 and back, check if holds neg_float = -0.0000152587f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); } -TEST(BF8, ConvertFP16Nearest) +TEST(BF8FNUZ, ConvertFP16Nearest) { // fix the tolerance value float abs_tol = 
1e-3; // convert 0 fp16 to bf8 and back, check if holds - ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_rne(half_t{0.0})), abs_tol); + ASSERT_NEAR( + half_t{0.0}, type_convert(f8_convert_rne(half_t{0.0})), abs_tol); // convert minimal fp16 to bf8 and back, check if holds ASSERT_NEAR(ck::NumericLimits::Min(), - type_convert(f8_convert_rne(ck::NumericLimits::Min())), + type_convert(f8_convert_rne(ck::NumericLimits::Min())), abs_tol); - // convert maximal bf8_t to fp16 and check if equal to 57344.0 + + const auto max_bf8_t_half = type_convert(ck::NumericLimits::Max()); + // convert maximal bf8_fnuz_t to fp16 and check if equal to 57344.0 ASSERT_NEAR( - half_t{57344.0}, type_convert(f8_convert_rne(half_t{57344.0})), abs_tol); + max_bf8_t_half, type_convert(f8_convert_rne(max_bf8_t_half)), abs_tol); // convert maximal fp16 to bf8 and back, check if clipped to 57344.0 - ASSERT_NEAR(half_t{57344.0}, - type_convert(f8_convert_rne(ck::NumericLimits::Max())), + ASSERT_NEAR(max_bf8_t_half, + type_convert(f8_convert_rne(ck::NumericLimits::Max())), abs_tol); - // convert QuietNaN fp16 to bf8_t and check if it is QuietNaN - ASSERT_NEAR(type_convert(0x80), - f8_convert_rne(ck::NumericLimits::QuietNaN()), + // convert QuietNaN fp16 to bf8_fnuz_t and check if it is QuietNaN + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_rne(ck::NumericLimits::QuietNaN()), abs_tol); // positive norm fp16 value to bf8 and back, check if holds half_t pos_half = half_t{0.0000762939}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); // negative norm fp16 value to bf8 and back, check if holds half_t neg_half = half_t{-0.0000610351}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); // positive subnorm fp16 value to bf8 and back, check if holds pos_half = half_t{0.0000305175}; - 
ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); // negative subnorm fp16 value to bf8 and back, check if holds neg_half = half_t{-0.0000152587}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); } -TEST(BF8, ConvertFP16Stochastic) +TEST(BF8FNUZ, ConvertFP16Stochastic) { // fix the tolerance value float abs_tol = 1e-3; // convert 0 fp16 to bf8 and back, check if holds - ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_sr(half_t{0.0})), abs_tol); + ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_sr(half_t{0.0})), abs_tol); // convert minimal fp16 to bf8 and back, check if holds ASSERT_NEAR(ck::NumericLimits::Min(), - type_convert(f8_convert_sr(ck::NumericLimits::Min())), + type_convert(f8_convert_sr(ck::NumericLimits::Min())), abs_tol); - // convert maximal bf8_t to fp16 and check if equal to 57344.0 + + const auto max_bf8_t_half = type_convert(ck::NumericLimits::Max()); + // convert maximal bf8_fnuz_t to fp16 and check if equal to 57344.0 ASSERT_NEAR( - half_t{57344.0}, type_convert(f8_convert_sr(half_t{57344.0})), abs_tol); + max_bf8_t_half, type_convert(f8_convert_sr(max_bf8_t_half)), abs_tol); // convert maximal fp16 to bf8 and back, check if clipped to 57344.0 - ASSERT_NEAR(half_t{57344.0}, - type_convert(f8_convert_sr(ck::NumericLimits::Max())), + ASSERT_NEAR(max_bf8_t_half, + type_convert(f8_convert_sr(ck::NumericLimits::Max())), abs_tol); - // convert QuietNaN fp16 to bf8_t and check if it is QuietNaN - ASSERT_NEAR(type_convert(0x80), - f8_convert_sr(ck::NumericLimits::QuietNaN()), + // convert QuietNaN fp16 to bf8_fnuz_t and check if it is QuietNaN + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_sr(ck::NumericLimits::QuietNaN()), abs_tol); // positive norm fp16 value to bf8 and back, check if holds half_t pos_half = half_t{0.0000762939}; - 
ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); // negative norm fp16 value to bf8 and back, check if holds half_t neg_half = half_t{-0.0000610351}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); // positive subnorm fp16 value to bf8 and back, check if holds pos_half = half_t{0.0000305175}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); // negative subnorm fp16 value to bf8 and back, check if holds neg_half = half_t{-0.0000152587}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); } diff --git a/test/data_type/test_bf8_ocp.cpp b/test/data_type/test_bf8_ocp.cpp new file mode 100644 index 0000000000..9d4ee38b15 --- /dev/null +++ b/test/data_type/test_bf8_ocp.cpp @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "gtest/gtest.h" +#include "ck/utility/data_type.hpp" +#include "ck/utility/type_convert.hpp" + +using ck::bf8_ocp_t; +using ck::f8_convert_rne; +using ck::f8_convert_sr; +using ck::half_t; +using ck::type_convert; + +TEST(BF8OCP, NumericLimits) +{ // constants given for OCP FP8 + EXPECT_EQ(ck::NumericLimits::Min(), + type_convert(0x04)); // 0b00000100 = 2^-14 + EXPECT_EQ(ck::NumericLimits::Max(), + type_convert(0x7B)); // 0b01111011 = 57344 + EXPECT_EQ(ck::NumericLimits::Lowest(), + type_convert(0xFB)); // 0b11111011 = -57344 + EXPECT_EQ(ck::NumericLimits::QuietNaN().data, + type_convert(0x7D).data); // 0b01111101 + EXPECT_FALSE(ck::NumericLimits::QuietNaN() == + ck::NumericLimits::QuietNaN()); + EXPECT_TRUE(ck::fp8_is_inf(type_convert(0xFC)) && + ck::fp8_is_inf(type_convert(0x7C))); +} + +TEST(BF8OCP, ConvertFP32Nearest) +{ + // fix the tolerance value + float abs_tol = 1e-6; + + // convert 0 float to bfp8 and back, check if holds + ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), 0.0f); + + // convert minimal float to bf8 and back, check if holds + ASSERT_NEAR(std::numeric_limits::min(), + type_convert(f8_convert_rne(std::numeric_limits::min())), + abs_tol); + + const auto max_bf8_t_float = type_convert(ck::NumericLimits::Max()); + + // convert maximal bf8_ocp_t to float and check if equal to bf8 max + ASSERT_NEAR( + max_bf8_t_float, type_convert(f8_convert_rne(max_bf8_t_float)), 0.0f); + + // convert maximal float to bf8 and back, check if clipped to bf8 max (saturation to finite) + ASSERT_NEAR(max_bf8_t_float, + type_convert(f8_convert_rne(std::numeric_limits::max())), + 0.0f); + + // convert float infinity to bf8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ(ck::NumericLimits::Max(), + f8_convert_rne(std::numeric_limits::infinity())); + + // positive normal float value to bf8 and back, check if holds + float pos_float = 0.0000762939f; // 10*2^-17 + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), 
abs_tol); + + // negative smallest normal bf8 value to bf8 and back, check if holds + constexpr auto neg_min_bf8 = -0.00006103515625f; //-2^-14 + ASSERT_NEAR(neg_min_bf8, type_convert(f8_convert_rne(neg_min_bf8)), 0.0f); + + // positive subnorm float value to bf8 and back, check if holds + constexpr auto pos_subnorm_bf8 = 0.000030517578125f; // 2^-15 + ASSERT_NEAR( + pos_subnorm_bf8, type_convert(f8_convert_rne(pos_subnorm_bf8)), 0.0f); + + // min subnorm bf8 value to bf8 and back, check if holds + constexpr auto min_subnorm_bf8 = -0.0000152587890625f; //-2^-16 + ASSERT_NEAR( + min_subnorm_bf8, type_convert(f8_convert_rne(min_subnorm_bf8)), 0.0f); + + // smaller than min subnorm bf8 value to bf8 must be zero + constexpr auto less_than_min_subnorm = 0.00000762939453125f; // 2^-17 + ASSERT_EQ(0.0f, type_convert(f8_convert_rne(less_than_min_subnorm))); + + // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN + const auto bf8_nan = f8_convert_rne(std::numeric_limits::quiet_NaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data)); +} + +TEST(BF8OCP, ConvertFP32Stochastic) +{ + // fix the tolerance value + float abs_tol = 1e-6; + + // convert 0 float to bfp8 and back, check if holds + ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), 0.0f); + + // convert minimal float to bf8 and back, check if holds + ASSERT_NEAR(std::numeric_limits::min(), + type_convert(f8_convert_sr(std::numeric_limits::min())), + abs_tol); + + const auto max_bf8_t_float = type_convert(ck::NumericLimits::Max()); + + // convert maximal bf8_ocp_t to float and check if equal to bf8 max + ASSERT_NEAR( + max_bf8_t_float, type_convert(f8_convert_sr(max_bf8_t_float)), 0.0f); + + // convert maximal float to bf8 and back, check if clipped to bf8 max (saturation to finite) + ASSERT_NEAR(max_bf8_t_float, + type_convert(f8_convert_sr(std::numeric_limits::max())), + 0.0f); + + // convert float infinity to bf8_ocp_t and check if it is max value (saturation to finite) + 
ASSERT_EQ(ck::NumericLimits::Max(), + f8_convert_sr(std::numeric_limits::infinity())); + + // positive normal float value to bf8 and back, check if holds + float pos_float = 0.0000762939f; // 10*2^-17 + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + + // negative smallest normal bf8 value to bf8 and back, check if holds + constexpr auto neg_min_bf8 = -0.00006103515625f; //-2^-14 + ASSERT_NEAR(neg_min_bf8, type_convert(f8_convert_sr(neg_min_bf8)), 0.0f); + + // positive subnorm float value to bf8 and back, check if holds + constexpr auto pos_subnorm_bf8 = 0.000030517578125f; // 2^-15 + ASSERT_NEAR( + pos_subnorm_bf8, type_convert(f8_convert_sr(pos_subnorm_bf8)), 0.0f); + + // min subnorm bf8 value to bf8 and back, check if holds + constexpr auto min_subnorm_bf8 = -0.0000152587890625f; //-2^-16 + ASSERT_NEAR( + min_subnorm_bf8, type_convert(f8_convert_sr(min_subnorm_bf8)), 0.0f); + + // smaller than min subnorm bf8 value to bf8 alternates between 0 and 2^-16 + constexpr auto less_than_min_subnorm = 0.00000762939453125f; // 2^-17 + ASSERT_NEAR(0.0f, + type_convert(f8_convert_sr(less_than_min_subnorm)), + 0.0000152587890625f); + + // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN + const auto bf8_nan = f8_convert_sr(std::numeric_limits::quiet_NaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data)); +} + +TEST(BF8OCP, ConvertFP16Nearest) +{ + // fix the tolerance value + constexpr half_t half_t_tol = 1e-3; + constexpr half_t half_t_zero = 0.0; + + // convert 0 half_t to bfp8 and back, check if holds + ASSERT_NEAR( + half_t_zero, type_convert(f8_convert_rne(half_t_zero)), half_t_zero); + + // convert minimal half_t to bf8 and back, check if holds + ASSERT_NEAR(ck::NumericLimits::Min(), + type_convert(f8_convert_rne(ck::NumericLimits::Min())), + half_t_tol); + + const auto max_bf8_t_half_t = type_convert(ck::NumericLimits::Max()); + + // convert maximal bf8_ocp_t to half_t and check if equal to bf8 max + 
ASSERT_NEAR(max_bf8_t_half_t, + type_convert(f8_convert_rne(max_bf8_t_half_t)), + half_t_zero); + + // convert maximal half_t to bf8 and back, check if clipped to bf8 max (saturation to finite) + ASSERT_NEAR(max_bf8_t_half_t, + type_convert(f8_convert_rne(ck::NumericLimits::Max())), + half_t_zero); + + // convert half_t infinity to bf8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ( + ck::NumericLimits::Max(), + f8_convert_rne(type_convert(std::numeric_limits::infinity()))); + + // positive normal bf8 value to bf8 and back, check if holds + constexpr half_t pos_norm_bf8{0.0000762939f}; // 10*2^-17 + ASSERT_NEAR( + pos_norm_bf8, type_convert(f8_convert_rne(pos_norm_bf8)), half_t_tol); + + // negative smallest normal bf8 value to bf8 and back, check if holds + constexpr half_t neg_min_bf8{-0.00006103515625f}; //-2^-14 + ASSERT_NEAR( + neg_min_bf8, type_convert(f8_convert_rne(neg_min_bf8)), half_t_zero); + + // positive subnorm bf8 value to bf8 and back, check if holds + constexpr half_t pos_subnorm_bf8{0.000030517578125f}; // 2^-15 + ASSERT_NEAR(pos_subnorm_bf8, + type_convert(f8_convert_rne(pos_subnorm_bf8)), + half_t_zero); + + // min subnorm bf8 value to bf8 and back, check if holds + constexpr half_t min_subnorm_bf8{-0.0000152587890625f}; //-2^-16 + ASSERT_NEAR(min_subnorm_bf8, + type_convert(f8_convert_rne(min_subnorm_bf8)), + half_t_zero); + + // smaller than min subnorm bf8 value to bf8 must be zero + constexpr half_t less_than_min_subnorm{0.00000762939453125f}; // 2^-17 + ASSERT_EQ(half_t_zero, type_convert(f8_convert_rne(less_than_min_subnorm))); + + // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN + const auto bf8_nan = f8_convert_rne(ck::NumericLimits::QuietNaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data)); +} + +TEST(BF8OCP, ConvertFP16Stochastic) +{ + // fix the tolerance value + constexpr half_t half_t_tol = 1e-3; + constexpr half_t half_t_zero = 0.0; + constexpr auto min_subnorm_bf8 = 
0.0000152587890625f; // 2^-16 + + // convert 0 half_t to bfp8 and back, check if holds + ASSERT_NEAR( + half_t_zero, type_convert(f8_convert_sr(half_t_zero)), half_t_zero); + + // convert minimal half_t (6.103515625e-05) to fp8 and back + ASSERT_NEAR(ck::NumericLimits::Min(), + type_convert(f8_convert_sr(ck::NumericLimits::Min())), + half_t_zero); + + const auto max_bf8_t_half_t = type_convert(ck::NumericLimits::Max()); + + // convert maximal bf8_ocp_t to half_t and check if equal to bf8 max + ASSERT_NEAR(max_bf8_t_half_t, + type_convert(f8_convert_sr(max_bf8_t_half_t)), + half_t_zero); + + // convert maximal half_t to bf8 and back, check if clipped to bf8 max (saturation to finite) + ASSERT_NEAR(max_bf8_t_half_t, + type_convert(f8_convert_sr(ck::NumericLimits::Max())), + half_t_zero); + + // convert half_t infinity to bf8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ( + ck::NumericLimits::Max(), + f8_convert_sr(type_convert(std::numeric_limits::infinity()))); + + // positive normal bf8 value to bf8 and back, check if holds + constexpr half_t pos_norm_bf8{0.0000762939f}; // 10*2^-17 + ASSERT_NEAR( + pos_norm_bf8, type_convert(f8_convert_sr(pos_norm_bf8)), half_t_tol); + + // negative smallest normal bf8 value to bf8 and back, check if holds + constexpr half_t neg_min_bf8{-0.00006103515625f}; //-2^-14 + ASSERT_NEAR( + neg_min_bf8, type_convert(f8_convert_sr(neg_min_bf8)), half_t_zero); + + // positive subnorm bf8 value to bf8 and back, check if holds + constexpr half_t pos_subnorm_bf8{0.000030517578125f}; // 2^-15 + ASSERT_NEAR(pos_subnorm_bf8, + type_convert(f8_convert_sr(pos_subnorm_bf8)), + half_t_zero); + + // min subnorm bf8 value to bf8 and back, check if holds + ASSERT_NEAR(half_t{-min_subnorm_bf8}, + type_convert(f8_convert_sr(half_t{-min_subnorm_bf8})), + half_t_zero); + + // smaller than min subnorm bf8 value to bf8 alternates between 0 and 2^-16 + constexpr half_t less_than_min_subnorm{0.00000762939453125f}; // 2^-17 + 
ASSERT_NEAR(half_t_zero, + type_convert(f8_convert_sr(less_than_min_subnorm)), + half_t{min_subnorm_bf8}); + + // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN + const auto bf8_nan = f8_convert_sr(ck::NumericLimits::QuietNaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data)); +} diff --git a/test/data_type/test_custom_type.cpp b/test/data_type/test_custom_type.cpp index 1016812544..a8fa9ba4a0 100644 --- a/test/data_type/test_custom_type.cpp +++ b/test/data_type/test_custom_type.cpp @@ -872,3 +872,161 @@ TEST(Complex_half, TestAsTypeReshape) test_vec.at(num_elem * i + 1)); }); } + +#if CK_USE_OCP_FP8 + +TEST(FP8OCP, TestSize) +{ + static_assert(std::is_same_v, "OCP FP8 is not enabled"); + ASSERT_EQ(sizeof(f8_t), sizeof(ck::fp8_storage_t)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); +} + +TEST(FP8OCP, TestAsType) +{ + static_assert(std::is_same_v, "OCP FP8 is not enabled"); + + // test size + std::array test_vec = {-4, -2, -0.5, -0.25, 1.0 / 8.0, 1, 1.5, 16}; + constexpr int size = test_vec.size(); + + // reference vector + vector_type right_vec; + + // check default CTOR + ck::static_for<0, size, 1>{}( + [&](auto i) { ASSERT_EQ(right_vec.template AsType()(Number{}), f8_t{0}); }); + + // assign test values to the vector + ck::static_for<0, size, 1>{}([&](auto i) { + right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); + }); + + // copy the vector + vector_type left_vec{right_vec}; + + // check if values were copied correctly + ck::static_for<0, size, 1>{}([&](auto i) { + ASSERT_EQ(left_vec.template AsType()(Number{}), + ck::type_convert(test_vec.at(i))); + }); + + ck::non_native_vector_base nnvb_f8x2(ck::type_convert(-10.0f)); + 
ASSERT_EQ(nnvb_f8x2.template AsType()(Number<0>{}), ck::type_convert(-10.0f)); + ASSERT_EQ(nnvb_f8x2.template AsType()(Number<1>{}), ck::type_convert(-10.0f)); +} + +TEST(FP8OCP, TestAsTypeReshape) +{ + static_assert(std::is_same_v, "OCP FP8 is not enabled"); + + // test size + std::array test_vec = {-8, -0.5, -0.25, 1.0 / 8.0, 1 / 256, 1, 1.5, 16}; + constexpr int size = test_vec.size(); + + // reference vector + vector_type right_vec; + + // check default CTOR + ck::static_for<0, size, 1>{}( + [&](auto i) { ASSERT_EQ(right_vec.template AsType()(Number{}), f8_t{0}); }); + + // assign test values to the vector + ck::static_for<0, size, 1>{}([&](auto i) { + right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); + }); + + // copy the first half of a vector + vector_type left_vec{ + right_vec.template AsType::type>()(Number<0>{})}; + + // check if values were copied correctly + ck::static_for<0, size / 2, 1>{}([&](auto i) { + ASSERT_EQ(left_vec.template AsType()(Number{}), + ck::type_convert(test_vec.at(i))); + }); +} + +TEST(BF8OCP, TestSize) +{ + static_assert(std::is_same_v, "OCP BF8 is not enabled"); + ASSERT_EQ(sizeof(bf8_t), sizeof(ck::fp8_storage_t)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); + ASSERT_EQ(sizeof(vector_type), sizeof(vector_type)); +} + +TEST(BF8OCP, TestAsType) +{ + static_assert(std::is_same_v, "OCP BF8 is not enabled"); + + // test size + std::array test_vec = {-4, -2, -0.5, -0.25, 1.0 / 8.0, 1, 1.5, 16}; + constexpr int size = test_vec.size(); + + // reference vector + vector_type right_vec; + + // check default CTOR + ck::static_for<0, size, 1>{}( + [&](auto i) { ASSERT_EQ(right_vec.template AsType()(Number{}), bf8_t{0}); }); + + // assign test values to the vector + ck::static_for<0, size, 
1>{}([&](auto i) { + right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); + }); + + // copy the vector + vector_type left_vec{right_vec}; + + // check if values were copied correctly + ck::static_for<0, size, 1>{}([&](auto i) { + ASSERT_EQ(left_vec.template AsType()(Number{}), + ck::type_convert(test_vec.at(i))); + }); + + ck::non_native_vector_base nnvb_bf8x2(ck::type_convert(-10.0f)); + ASSERT_EQ(nnvb_bf8x2.template AsType()(Number<0>{}), ck::type_convert(-10.0f)); + ASSERT_EQ(nnvb_bf8x2.template AsType()(Number<1>{}), ck::type_convert(-10.0f)); +} + +TEST(BF8OCP, TestAsTypeReshape) +{ + static_assert(std::is_same_v, "OCP BF8 is not enabled"); + + // test size + std::array test_vec = {-8, -0.5, -0.25, 1.0 / 8.0, 1 / 256, 1, 1.5, 16}; + constexpr int size = test_vec.size(); + + // reference vector + vector_type right_vec; + + // check default CTOR + ck::static_for<0, size, 1>{}( + [&](auto i) { ASSERT_EQ(right_vec.template AsType()(Number{}), bf8_t{0}); }); + + // assign test values to the vector + ck::static_for<0, size, 1>{}([&](auto i) { + right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); + }); + + // copy the first half of a vector + vector_type left_vec{ + right_vec.template AsType::type>()(Number<0>{})}; + + // check if values were copied correctly + ck::static_for<0, size / 2, 1>{}([&](auto i) { + ASSERT_EQ(left_vec.template AsType()(Number{}), + ck::type_convert(test_vec.at(i))); + }); +} + +#endif diff --git a/test/data_type/test_fp8.cpp b/test/data_type/test_fp8_fnuz.cpp similarity index 52% rename from test/data_type/test_fp8.cpp rename to test/data_type/test_fp8_fnuz.cpp index 25d9d9d2fb..c2ec6dad94 100644 --- a/test/data_type/test_fp8.cpp +++ b/test/data_type/test_fp8_fnuz.cpp @@ -7,154 +7,171 @@ using ck::f8_convert_rne; using ck::f8_convert_sr; -using ck::f8_t; +using ck::f8_fnuz_t; using ck::half_t; using ck::type_convert; -TEST(FP8, NumericLimits) +TEST(FP8FNUZ, NumericLimits) { // constants given for 
negative zero nan mode - EXPECT_EQ(ck::NumericLimits::Min(), type_convert(0x08)); - EXPECT_EQ(ck::NumericLimits::Max(), type_convert(0x7F)); - EXPECT_EQ(ck::NumericLimits::Lowest(), type_convert(0xFF)); - EXPECT_EQ(ck::NumericLimits::QuietNaN(), type_convert(0x80)); + EXPECT_EQ(ck::NumericLimits::Min(), type_convert(0x08)); + EXPECT_EQ(ck::NumericLimits::Max(), type_convert(0x7F)); + EXPECT_EQ(ck::NumericLimits::Lowest(), type_convert(0xFF)); + EXPECT_EQ(ck::NumericLimits::QuietNaN(), type_convert(0x80)); } -TEST(FP8, ConvertFP32Nearest) +TEST(FP8FNUZ, ConvertFP32Nearest) { // fix the tolerance value float abs_tol = 1e-6; // convert 0 float to fp8 and back, check if holds - ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), abs_tol); + ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), abs_tol); // don't run the next test on gfx11 devices #ifndef CK_SKIP_FLAKY_F8_TEST // convert minimal float to fp8 and back, check if holds ASSERT_NEAR(std::numeric_limits::min(), - type_convert(f8_convert_rne(std::numeric_limits::min())), + type_convert(f8_convert_rne(std::numeric_limits::min())), abs_tol); #endif - // convert maximal f8_t to float and check if equal to 240.0 - ASSERT_NEAR(240.0f, type_convert(f8_convert_rne(240.0f)), abs_tol); - // convert maximal float to fp8 and back, check if clipped to 240.0 - ASSERT_NEAR(240.0f, - type_convert(f8_convert_rne(std::numeric_limits::max())), + + const auto max_f8_t_float = type_convert(ck::NumericLimits::Max()); + // convert maximal f8_fnuz_t to float and check if equal to fp8 max + ASSERT_NEAR( + max_f8_t_float, type_convert(f8_convert_rne(max_f8_t_float)), abs_tol); + + // XXX: FNUZ f8_convert_rne behavior is inconsistent. + // Clipping large values to fp8 max (saturation to finite) contradicts converting inf float to + // fp8 qNAN (no saturation). 
+ + // convert maximal float to fp8 and back, check if clipped to fp8 max + ASSERT_NEAR(max_f8_t_float, + type_convert(f8_convert_rne(std::numeric_limits::max())), abs_tol); - // convert inf float to f8_t and check if it is qNan - ASSERT_NEAR(type_convert(0x80), - f8_convert_rne(std::numeric_limits::infinity()), + // convert inf float to f8_fnuz_t and check if it is qNan + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_rne(std::numeric_limits::infinity()), abs_tol); // positive norm float value to fp8 and back, check if holds float pos_float = 0.017578125f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); // negative norm float value to fp8 and back, check if holds float neg_float = -0.015625f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); // positive subnorm float value to fp8 and back, check if holds pos_float = 0.00390625f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); // negative subnorm float value to fp8 and back, check if holds neg_float = -0.001953125f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), abs_tol); } -TEST(FP8, ConvertFP32Stochastic) +TEST(FP8FNUZ, ConvertFP32Stochastic) { // fix the tolerance value float abs_tol = 1e-6; // convert 0 float to fp8 and back, check if holds - ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), abs_tol); + ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), abs_tol); // convert minimal float to fp8 and back, check if holds ASSERT_NEAR(std::numeric_limits::min(), - type_convert(f8_convert_sr(std::numeric_limits::min())), + type_convert(f8_convert_sr(std::numeric_limits::min())), abs_tol); - // convert 
maximal f8_t to float and check if equal to 240.0 - ASSERT_NEAR(240.0f, type_convert(f8_convert_sr(240.0f)), abs_tol); - // convert maximal float to fp8 and back, check if clipped to 240.0 - ASSERT_NEAR(240.0f, - type_convert(f8_convert_sr(std::numeric_limits::max())), + + const auto max_f8_t_float = type_convert(ck::NumericLimits::Max()); + // convert maximal f8_fnuz_t to float and check if equal to fp8 max + ASSERT_NEAR( + max_f8_t_float, type_convert(f8_convert_sr(max_f8_t_float)), abs_tol); + // convert maximal float to fp8 and back, check if clipped to fp8 max + ASSERT_NEAR(max_f8_t_float, + type_convert(f8_convert_sr(std::numeric_limits::max())), abs_tol); - // convert inf float to f8_t and check if it is qNan - ASSERT_NEAR(type_convert(0x80), - f8_convert_sr(std::numeric_limits::infinity()), + // convert inf float to f8_fnuz_t and check if it is qNan + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_sr(std::numeric_limits::infinity()), abs_tol); // positive norm float value to fp8 and back, check if holds float pos_float = 0.017578125f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); // negative norm float value to fp8 and back, check if holds float neg_float = -0.015625f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); // positive subnorm float value to fp8 and back, check if holds pos_float = 0.00390625f; - ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); // negative subnorm float value to fp8 and back, check if holds neg_float = -0.001953125f; - ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); + ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), abs_tol); } -TEST(FP8, ConvertFP16Nearest) +TEST(FP8FNUZ, 
ConvertFP16Nearest) { // fix the tolerance value float abs_tol = 1e-3; // convert 0 fp16 to fp8 and back, check if holds - ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_rne(half_t{0.0})), abs_tol); + ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_rne(half_t{0.0})), abs_tol); // convert minimal fp16 to fp8 and back, check if holds ASSERT_NEAR(ck::NumericLimits::Min(), - type_convert(f8_convert_rne(ck::NumericLimits::Min())), + type_convert(f8_convert_rne(ck::NumericLimits::Min())), abs_tol); - // convert maximal f8_t to fp16 and check if equal to 240.0 - ASSERT_NEAR(half_t{240.0}, type_convert(f8_convert_rne(half_t{240.0})), abs_tol); - // convert maximal fp16 to fp8 and back, check if clipped to 240.0 - ASSERT_NEAR(half_t{240.0}, - type_convert(f8_convert_rne(ck::NumericLimits::Max())), + + const auto max_f8_t_half = type_convert(ck::NumericLimits::Max()); + // convert maximal f8_fnuz_t to fp16 and check if equal to fp8 max + ASSERT_NEAR( + max_f8_t_half, type_convert(f8_convert_rne(max_f8_t_half)), abs_tol); + // convert maximal fp16 to fp8 and back, check if clipped to fp8 max + ASSERT_NEAR(max_f8_t_half, + type_convert(f8_convert_rne(ck::NumericLimits::Max())), abs_tol); - // convert QuietNaN fp16 to f8_t and check if it is QuietNaN - ASSERT_NEAR(type_convert(0x80), - f8_convert_rne(ck::NumericLimits::QuietNaN()), + // convert QuietNaN fp16 to f8_fnuz_t and check if it is QuietNaN + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + f8_convert_rne(ck::NumericLimits::QuietNaN()), abs_tol); // positive norm fp16 value to fp8 and back, check if holds half_t pos_half = half_t{0.017578125}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); // negative norm fp16 value to fp8 and back, check if holds half_t neg_half = half_t{-0.015625}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), 
abs_tol); // positive subnorm fp16 value to fp8 and back, check if holds pos_half = half_t{0.00390625}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_rne(pos_half)), abs_tol); // negative subnorm fp16 value to fp8 and back, check if holds neg_half = half_t{-0.001953125}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_rne(neg_half)), abs_tol); } -TEST(FP8, ConvertFP16Stochastic) +TEST(FP8FNUZ, ConvertFP16Stochastic) { // fix the tolerance value float abs_tol = 1e-3; // convert 0 fp16 to fp8 and back, check if holds - ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_sr(half_t{0.0})), abs_tol); + ASSERT_NEAR(half_t{0.0}, type_convert(f8_convert_sr(half_t{0.0})), abs_tol); // convert minimal fp16 to fp8 and back, check if holds ASSERT_NEAR(ck::NumericLimits::Min(), - type_convert(f8_convert_sr(ck::NumericLimits::Min())), + type_convert(f8_convert_sr(ck::NumericLimits::Min())), abs_tol); - // convert maximal f8_t to fp16 and check if equal to 240.0 - ASSERT_NEAR(half_t{240.0}, type_convert(f8_convert_sr(half_t{240.0})), abs_tol); - // convert maximal fp16 to fp8 and back, check if clipped to 240.0 - ASSERT_NEAR(half_t{240.0}, - type_convert(f8_convert_sr(ck::NumericLimits::Max())), + + const auto max_f8_t_half = type_convert(ck::NumericLimits::Max()); + // convert maximal f8_fnuz_t to fp16 and check if equal to fp8 max + ASSERT_NEAR( + max_f8_t_half, type_convert(f8_convert_sr(max_f8_t_half)), abs_tol); + // convert maximal fp16 to fp8 and back, check if clipped to fp8 max + ASSERT_NEAR(max_f8_t_half, + type_convert(f8_convert_sr(ck::NumericLimits::Max())), abs_tol); - // convert QuietNaN fp16 to f8_t and check if it is QuietNaN - ASSERT_NEAR(type_convert(0x80), - f8_convert_sr(ck::NumericLimits::QuietNaN()), + // convert QuietNaN fp16 to f8_fnuz_t and check if it is QuietNaN + ASSERT_NEAR(ck::NumericLimits::QuietNaN(), + 
f8_convert_sr(ck::NumericLimits::QuietNaN()), abs_tol); // positive norm fp16 value to fp8 and back, check if holds half_t pos_half = half_t{0.017578125}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); // negative norm fp16 value to fp8 and back, check if holds half_t neg_half = half_t{-0.015625}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); // positive subnorm fp16 value to fp8 and back, check if holds pos_half = half_t{0.00390625}; - ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); + ASSERT_NEAR(pos_half, type_convert(f8_convert_sr(pos_half)), abs_tol); // negative subnorm fp16 value to fp8 and back, check if holds neg_half = half_t{-0.001953125}; - ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); + ASSERT_NEAR(neg_half, type_convert(f8_convert_sr(neg_half)), abs_tol); } diff --git a/test/data_type/test_fp8_ocp.cpp b/test/data_type/test_fp8_ocp.cpp new file mode 100644 index 0000000000..a8077f1bdf --- /dev/null +++ b/test/data_type/test_fp8_ocp.cpp @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "gtest/gtest.h" +#include "ck/utility/data_type.hpp" +#include "ck/utility/type_convert.hpp" + +using ck::f8_convert_rne; +using ck::f8_convert_sr; +using ck::f8_ocp_t; +using ck::half_t; +using ck::type_convert; + +TEST(FP8OCP, NumericLimits) +{ + // constants given for OCP FP8 + EXPECT_EQ(ck::NumericLimits::Min(), + type_convert(0x08)); // 0b00001000 = 2^-6 + EXPECT_EQ(ck::NumericLimits::Max(), type_convert(0x7E)); // 0b01111110 = 448 + EXPECT_EQ(ck::NumericLimits::Lowest(), + type_convert(0xFE)); // 0b11111110 = -448 + EXPECT_EQ(ck::NumericLimits::QuietNaN().data, + type_convert(0x7F).data); // 0b01111111 + EXPECT_FALSE(ck::NumericLimits::QuietNaN() == + ck::NumericLimits::QuietNaN()); +} + +TEST(FP8OCP, ConvertFP32Nearest) +{ + // fix the tolerance value + float abs_tol = 1e-6; + // convert 0 float to fp8 and back, check if holds + ASSERT_NEAR(0.0f, type_convert(f8_convert_rne(0.0f)), 0.0f); + + // convert minimal float to fp8 and back, check if holds + ASSERT_NEAR(std::numeric_limits::min(), + type_convert(f8_convert_rne(std::numeric_limits::min())), + abs_tol); + + const auto max_f8_t_float = type_convert(ck::NumericLimits::Max()); + + // convert maximal f8_ocp_t to float and check if equal to fp8 max + ASSERT_NEAR( + max_f8_t_float, type_convert(f8_convert_rne(max_f8_t_float)), 0.0f); + + // convert maximal float to fp8 and back, check if clipped to fp8 max (saturation to finite) + ASSERT_NEAR(max_f8_t_float, + type_convert(f8_convert_rne(std::numeric_limits::max())), + 0.0f); + + // convert float infinity to f8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ(ck::NumericLimits::Max(), + f8_convert_rne(std::numeric_limits::infinity())); + + // positive norm float value to fp8 and back, check if holds + float pos_float = 0.017578125f; + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + + // smallest normal fp8 value to fp8 and back, check if holds + float neg_float = -0.015625f; //-2^-6 + 
ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), 0.0f); + + // positive subnorm float value to fp8 and back, check if holds + pos_float = 0.00390625f; + ASSERT_NEAR(pos_float, type_convert(f8_convert_rne(pos_float)), abs_tol); + + // min subnorm fp8 value to fp8 and back, check if holds + neg_float = -0.001953125f; //-2^-9 + ASSERT_NEAR(neg_float, type_convert(f8_convert_rne(neg_float)), 0.0f); + + // smaller than min subnorm fp8 value to fp8 must be zero + auto less_than_min_subnorm = 0.0009765625f; // 2^-10 + ASSERT_EQ(0.0f, type_convert(f8_convert_rne(less_than_min_subnorm))); + + // convert quiet NaN to f8_ocp_t and check if it is quiet NaN + auto f8_nan = f8_convert_rne(std::numeric_limits::quiet_NaN()); + ASSERT_TRUE((f8_nan.data & 0x7f) == 0x7f); +} + +TEST(FP8OCP, ConvertFP32Stochastic) +{ + // fix the tolerance value + float abs_tol = 1e-6; + // convert 0 float to fp8 and back, check if holds + ASSERT_NEAR(0.0f, type_convert(f8_convert_sr(0.0f)), 0.0f); + + // convert minimal float to fp8 and back, check if holds + ASSERT_NEAR(std::numeric_limits::min(), + type_convert(f8_convert_sr(std::numeric_limits::min())), + abs_tol); + + const auto max_f8_t_float = type_convert(ck::NumericLimits::Max()); + + // convert maximal f8_ocp_t to float and check if equal to fp8 max + ASSERT_NEAR(max_f8_t_float, type_convert(f8_convert_sr(max_f8_t_float)), 0.0f); + + // convert maximal float to fp8 and back, check if clipped to fp8 max (saturation to finite) + ASSERT_NEAR(max_f8_t_float, + type_convert(f8_convert_sr(std::numeric_limits::max())), + 0.0f); + + // convert float infinity to f8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ(ck::NumericLimits::Max(), + f8_convert_sr(std::numeric_limits::infinity())); + + // positive norm float value to fp8 and back, check if holds + float pos_float = 0.017578125f; + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + + // smallest normal fp8 value to fp8 and back, check 
if holds + float neg_float = -0.015625f; //-2^-6 + ASSERT_NEAR(neg_float, type_convert(f8_convert_sr(neg_float)), 0.0f); + + // positive subnorm float value to fp8 and back, check if holds + pos_float = 0.00390625f; + ASSERT_NEAR(pos_float, type_convert(f8_convert_sr(pos_float)), abs_tol); + + // min subnorm fp8 value to fp8 and back, check if holds + constexpr auto min_subnorm_fp8 = -0.001953125f; //-2^-9 + ASSERT_NEAR( + min_subnorm_fp8, type_convert(f8_convert_sr(min_subnorm_fp8)), 0.0f); + + // smaller than min subnorm fp8 value to fp8 alternates between 0 and 2^-9 + auto less_than_min_subnorm = 0.0009765625f; // 2^-10 + ASSERT_NEAR( + 0.0f, type_convert(f8_convert_sr(less_than_min_subnorm)), 0.001953125f); + + // convert quiet NaN to f8_ocp_t and check if it is quiet NaN + auto f8_nan = f8_convert_sr(std::numeric_limits::quiet_NaN()); + ASSERT_TRUE((f8_nan.data & 0x7f) == 0x7f); +} + +TEST(FP8OCP, ConvertFP16Nearest) +{ + // fix the tolerance value + constexpr half_t half_t_tol = 1e-3; + constexpr half_t half_t_zero = 0.0; + // convert 0 half_t to fp8 and back, check if holds + ASSERT_NEAR( + half_t_zero, type_convert(f8_convert_rne(half_t_zero)), half_t_zero); + + // convert minimal half_t to fp8 and back, check if holds + ASSERT_NEAR(ck::NumericLimits::Min(), + type_convert(f8_convert_rne(ck::NumericLimits::Min())), + half_t_tol); + const auto max_f8_t_half_t = type_convert(ck::NumericLimits::Max()); + + // convert maximal f8_ocp_t to half_t and check if equal to fp8 max + ASSERT_NEAR(max_f8_t_half_t, + type_convert(f8_convert_rne(max_f8_t_half_t)), + half_t_zero); + + // convert maximal half_t to fp8 and back, check if clipped to fp8 max (saturation to finite) + ASSERT_NEAR(max_f8_t_half_t, + type_convert(f8_convert_rne(ck::NumericLimits::Max())), + half_t_zero); + + // convert half_t infinity to f8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ( + ck::NumericLimits::Max(), + 
f8_convert_rne(type_convert(std::numeric_limits::infinity()))); + + // positive norm half_t value to fp8 and back, check if holds + half_t pos_half_t{0.017578125f}; + ASSERT_NEAR(pos_half_t, type_convert(f8_convert_rne(pos_half_t)), half_t_tol); + + // smallest normal fp8 value to fp8 and back, check if holds + half_t neg_half_t{-0.015625f}; //-2^-6 + ASSERT_NEAR( + neg_half_t, type_convert(f8_convert_rne(neg_half_t)), half_t_zero); + + // positive subnorm half_t value to fp8 and back, check if holds + pos_half_t = half_t{0.00390625f}; + ASSERT_NEAR(pos_half_t, type_convert(f8_convert_rne(pos_half_t)), half_t_tol); + + // min subnorm fp8 value to fp8 and back, check if holds + neg_half_t = half_t{-0.001953125f}; //-2^-9 + ASSERT_NEAR( + neg_half_t, type_convert(f8_convert_rne(neg_half_t)), half_t_zero); + + // smaller than min subnorm fp8 value to fp8 must be zero + auto less_than_min_subnorm = half_t{0.0009765625f}; // 2^-10 + ASSERT_EQ(half_t_zero, type_convert(f8_convert_rne(less_than_min_subnorm))); + + // convert quiet NaN to f8_ocp_t and check if it is quiet NaN + auto f8_nan = f8_convert_rne(ck::NumericLimits::QuietNaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_f8_is_nan(f8_nan.data)); +} + +TEST(FP8OCP, ConvertFP16Stochastic) +{ + // fix the tolerance value + constexpr half_t half_t_tol = 1e-3; + constexpr half_t half_t_zero = 0.0; + constexpr auto min_subnorm_fp8 = 0.001953125f; // 2^-9 + + // convert 0 half_t to fp8 and back, check if holds + ASSERT_NEAR( + half_t_zero, type_convert(f8_convert_sr(half_t_zero)), half_t_zero); + + // convert minimal half_t (6.103515625e-05) to fp8 and back + // alternates between 0 and 2^-9 (0.001953125) + ASSERT_NEAR(ck::NumericLimits::Min(), + type_convert(f8_convert_sr(ck::NumericLimits::Min())), + type_convert(min_subnorm_fp8)); + + const auto max_f8_t_half_t = type_convert(ck::NumericLimits::Max()); + + // convert maximal f8_ocp_t to half_t and check if equal to fp8 max + ASSERT_NEAR(max_f8_t_half_t, + 
type_convert(f8_convert_sr(max_f8_t_half_t)), + half_t_zero); + + // convert maximal half_t to fp8 and back, check if clipped to fp8 max (saturation to finite) + ASSERT_NEAR(max_f8_t_half_t, + type_convert(f8_convert_sr(ck::NumericLimits::Max())), + half_t_zero); + + // convert half_t infinity to f8_ocp_t and check if it is max value (saturation to finite) + ASSERT_EQ( + ck::NumericLimits::Max(), + f8_convert_sr(type_convert(std::numeric_limits::infinity()))); + + // positive norm half_t value to fp8 and back, check if holds + half_t pos_half_t{0.017578125f}; + ASSERT_NEAR(pos_half_t, type_convert(f8_convert_sr(pos_half_t)), half_t_tol); + + // smallest normal fp8 value to fp8 and back, check if holds + half_t neg_half_t{-0.015625f}; //-2^-6 + ASSERT_NEAR(neg_half_t, type_convert(f8_convert_sr(neg_half_t)), half_t_zero); + + // positive subnorm half_t value to fp8 and back, check if holds + pos_half_t = half_t{0.00390625f}; + ASSERT_NEAR(pos_half_t, type_convert(f8_convert_sr(pos_half_t)), half_t_tol); + + // min subnorm fp8 value to fp8 and back, check if holds + neg_half_t = half_t{-min_subnorm_fp8}; //-2^-9 + ASSERT_NEAR(neg_half_t, type_convert(f8_convert_sr(neg_half_t)), half_t_zero); + + // smaller than min subnorm fp8 value to fp8 alternates between 0 and 2^-9 + auto less_than_min_subnorm = half_t{0.0009765625f}; // 2^-10 + ASSERT_NEAR( + type_convert(half_t_zero), + type_convert(type_convert(f8_convert_sr(less_than_min_subnorm))), + min_subnorm_fp8); + + // convert quiet NaN to f8_ocp_t and check if it is quiet NaN + auto f8_nan = f8_convert_sr(ck::NumericLimits::QuietNaN()); + ASSERT_TRUE(ck::fp8_impl::ocp_f8_is_nan(f8_nan.data)); +} diff --git a/test/pool/test_avg_pool2d_fwd.cpp b/test/pool/test_avg_pool2d_fwd.cpp index 8dbb37b84f..b5e733419a 100644 --- a/test/pool/test_avg_pool2d_fwd.cpp +++ b/test/pool/test_avg_pool2d_fwd.cpp @@ -138,7 +138,7 @@ TYPED_TEST_SUITE(AvgPool2D_BF16, AvgPool2D_BF16_Types); TYPED_TEST_SUITE(AvgPool2D_I8, AvgPool2D_I8_Types); 
TYPED_TEST_SUITE(AvgPool2D_F8, AvgPool2D_F8_Types); -TYPED_TEST(AvgPool2D_F32, AvgPool2D_I8_Test) { this->Run(); } +TYPED_TEST(AvgPool2D_F32, AvgPool2D_F32_Test) { this->Run(); } TYPED_TEST(AvgPool2D_F16, AvgPool2D_F16_Test) { this->Run(); } TYPED_TEST(AvgPool2D_BF16, AvgPool2D_BF16_Test) { this->Run(); } TYPED_TEST(AvgPool2D_I8, AvgPool2D_I8_Test) { this->Run(); } diff --git a/test/pool/test_max_pool2d_fwd.cpp b/test/pool/test_max_pool2d_fwd.cpp index 80ca47407b..2179242754 100644 --- a/test/pool/test_max_pool2d_fwd.cpp +++ b/test/pool/test_max_pool2d_fwd.cpp @@ -143,7 +143,7 @@ TYPED_TEST_SUITE(MaxPool2D_BF16, MaxPool2D_BF16_Types); TYPED_TEST_SUITE(MaxPool2D_I8, MaxPool2D_I8_Types); TYPED_TEST_SUITE(MaxPool2D_F8, MaxPool2D_F8_Types); -TYPED_TEST(MaxPool2D_F32, MaxPool2D_I8_Test) { this->Run(); } +TYPED_TEST(MaxPool2D_F32, MaxPool2D_F32_Test) { this->Run(); } TYPED_TEST(MaxPool2D_F16, MaxPool2D_F16_Test) { this->Run(); } TYPED_TEST(MaxPool2D_BF16, MaxPool2D_BF16_Test) { this->Run(); } TYPED_TEST(MaxPool2D_I8, MaxPool2D_I8_Test) { this->Run(); } From 5affda819de5624e83d8d90f883c0a87f80b7ee2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 4 Dec 2024 00:46:47 +0100 Subject: [PATCH 34/52] Add basic documentation structure (#1715) * Add basic documentation structure * Add terminology placeholder * Add codegen placeholder * Create template for each page --- CONTRIBUTORS.md | 1 + README.md | 34 ++++++++++++++--------------- TERMINOLOGY.md | 2 ++ client_example/25_wrapper/README.md | 11 +++------- client_example/README.md | 2 ++ codegen/README.md | 2 ++ example/README.md | 2 ++ include/ck/README.md | 19 ++++++++++++++++ include/ck_tile/README.md | 3 ++- profiler/README.md | 12 ++++++++++ 10 files changed, 62 insertions(+), 26 deletions(-) create mode 100644 TERMINOLOGY.md create mode 100644 codegen/README.md create mode 100644 example/README.md create mode 100644 include/ck/README.md diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 
cdce5a4630..8ef5c2b726 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -1,3 +1,4 @@ +[Back to the main page](./README.md) # Composable Kernel Developers and Contributors This is the list of developers and contributors to Composable Kernel library diff --git a/README.md b/README.md index d8eb152ee9..c0872aa567 100644 --- a/README.md +++ b/README.md @@ -26,23 +26,15 @@ The current CK library is structured into four layers: ## General information -To build our documentation locally, use the following code: - -``` bash -cd docs -pip3 install -r sphinx/requirements.txt -python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html -``` - -You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page. - -```note -If you use CK, cite us as follows: - -* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???): - This paper will be available on arXiv soon. -* [CITATION.cff](/CITATION.cff) -``` +* [CK supported operations](include/ck/README.md) +* [CK Tile supported operations](include/ck_tile/README.md) +* [CK wrapper](client_example/25_wrapper/README.md) +* [CK codegen](codegen/README.md) +* [CK profiler](profiler/README.md) +* [Examples (Custom use of CK supported operations)](example/README.md) +* [Client examples (Use of CK supported operations with instance factory)](client_example/README.md) +* [Terminology](/TERMINOLOGY.md) +* [Contributors](/CONTRIBUTORS.md) CK is released under the **[MIT license](/LICENSE)**. @@ -137,6 +129,14 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa You can find instructions for running ckProfiler in [profiler](/profiler). +* Build our documentation locally: + + ``` bash + cd docs + pip3 install -r sphinx/requirements.txt + python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . 
_build/html + ``` + Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly. However, `-j` launches unlimited number of threads, which can cause the build to run out of memory and crash. On average, you should expect each thread to use ~2Gb of RAM. diff --git a/TERMINOLOGY.md b/TERMINOLOGY.md new file mode 100644 index 0000000000..e8833efb89 --- /dev/null +++ b/TERMINOLOGY.md @@ -0,0 +1,2 @@ +[Back to the main page](./README.md) +# Composable Kernel terminology \ No newline at end of file diff --git a/client_example/25_wrapper/README.md b/client_example/25_wrapper/README.md index eba3de017f..3db9a9af44 100644 --- a/client_example/25_wrapper/README.md +++ b/client_example/25_wrapper/README.md @@ -1,14 +1,9 @@ +[Back to the main page](../../README.md) # Composable Kernel wrapper GEMM tutorial -This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) -wrapper. We present the base version of GEMM without most of the available optimizations; however, -it's worth noting that CK has kernels with different optimizations. +This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) wrapper. We present the base version of GEMM without most of the available optimizations; however, it's worth noting that CK has kernels with different optimizations. -To implement these optimizations, you can use the CK wrapper or directly use available instances in -CK. You can also refer to the -[optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp), -that uses CK wrapper based on the -[`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation. +To implement these optimizations, you can use the CK wrapper or directly use available instances in CK. 
You can also refer to the [optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp), that uses CK wrapper based on the [`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation. The kernel definition should look similar to: diff --git a/client_example/README.md b/client_example/README.md index 64a7130d53..d9f793434d 100644 --- a/client_example/README.md +++ b/client_example/README.md @@ -1,3 +1,5 @@ +[Back to the main page](../README.md) +# Composable Kernel client examples ## Client application links to CK library, and therefore CK library needs to be installed before building client applications. diff --git a/codegen/README.md b/codegen/README.md new file mode 100644 index 0000000000..deadf3221d --- /dev/null +++ b/codegen/README.md @@ -0,0 +1,2 @@ +[Back to the main page](../README.md) +# Composable Kernel codegen \ No newline at end of file diff --git a/example/README.md b/example/README.md new file mode 100644 index 0000000000..43b3419f80 --- /dev/null +++ b/example/README.md @@ -0,0 +1,2 @@ +[Back to the main page](../README.md) +# Composable Kernel examples \ No newline at end of file diff --git a/include/ck/README.md b/include/ck/README.md new file mode 100644 index 0000000000..bff689f6b0 --- /dev/null +++ b/include/ck/README.md @@ -0,0 +1,19 @@ +[Back to the main page](../../README.md) +# Composable Kernel supported operations +## Supported device operations +* [Average pooling]() +* [Batched contraction]() +* [Batched gemm]() +* [Batchnorm]() +* [CGEMM]() +* [Contraction]() +* [Convolution]() +* [Image to Column and Column to Image]() +* [Elementwise]() +* [GEMM]() +* [Max pooling]() +* [Reduce]() +* [Normalization]() +* [Permute]() +* [Put]() +* [Softmax]() diff --git a/include/ck_tile/README.md b/include/ck_tile/README.md index 572e9c7e48..9f88af1ca1 100644 --- 
a/include/ck_tile/README.md +++ b/include/ck_tile/README.md @@ -1,4 +1,5 @@ -# ck_tile +[Back to the main page](../../README.md) +# Composable Kernel Tile ## concept `ck_tile` provides a programming model with templated abstractions to enable users to implement performance-critical kernels for machine learning workloads. introduces following basic concepts to help users building your own operator - tensor coordinate transformation, this is the core concept of layout/index transform abstraction in both compiler time and run time. diff --git a/profiler/README.md b/profiler/README.md index 10febcabdc..3f4837aada 100644 --- a/profiler/README.md +++ b/profiler/README.md @@ -1,3 +1,5 @@ +[Back to the main page](../README.md) +# Composable Kernel profiler ## Profile GEMM kernels ```bash #arg1: tensor operation (gemm=GEMM) @@ -180,3 +182,13 @@ Note: Column to image kernel adds to the output memory, this will cause output b ################ op datatype verify init log time dim0 dim1 dim2 in_stride0 in_stride1 in_stride2 out_stride0 out_stride1 out_stride2 ./bin/ckProfiler permute_scale 0 1 1 0 1 64 64 64 4096 64 1 1 64 4096 ``` + +## Convert MIOpen driver command to CKProfiler + +```bash +python3 ../script/convert_miopen_driver_to_profiler.py +/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3 +-p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 32 -F 1 -t 1 +``` + +Only convolution driver is supported. 
From 126ce85aa10347007fb5ca2068bcad378cb17d74 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Wed, 4 Dec 2024 15:59:58 +0800 Subject: [PATCH 35/52] [CK_TILE] Use 'false' for highest dimension padding flags (#1716) * Use 'false' for highest dimension padding flags * Update padding flag of bias --- .../ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp | 15 +++++++-------- .../kernel/fmha_fwd_splitkv_combine_kernel.hpp | 2 +- .../ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 15 +++++++-------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 3de433d6a7..3a66b78a5f 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -998,14 +998,14 @@ struct FmhaFwdKernel return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }(); const auto k_dram = [&]() { @@ -1019,7 +1019,7 @@ struct FmhaFwdKernel return pad_tensor_view( k_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); const auto v_dram = [&]() { if constexpr(std::is_same_v) @@ -1041,7 +1041,7 @@ struct FmhaFwdKernel return pad_tensor_view( v_dram_transposed, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { @@ -1055,7 +1055,7 @@ struct FmhaFwdKernel return pad_tensor_view( v_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }(); @@ -1097,9 +1097,8 @@ struct FmhaFwdKernel number{}, number<1>{}); - return pad_tensor_view(bias_dram_naive, - bias_dram_window_lengths, - sequence{}); + return pad_tensor_view( + bias_dram_naive, bias_dram_window_lengths, sequence{}); }(); return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0}); diff --git 
a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp index ca9da91a5d..0bccabdd2f 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp @@ -339,7 +339,7 @@ struct FmhaFwdSplitKVCombineKernel number{}, number<1>{}); - auto o_acc_dram_view = pad_tensor_view( + const auto o_acc_dram_view = pad_tensor_view( o_acc_dram_naive, make_tuple(number<1>{}, number{}, number{}), sequence{}); diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index dcb671d81e..f37e676da0 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -623,14 +623,14 @@ struct FmhaFwdSplitKVKernel return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }(); @@ -645,7 +645,7 @@ struct FmhaFwdSplitKVKernel return pad_tensor_view( k_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }; const auto k_dram = [&]() { if constexpr(kIsPagedKV) @@ -678,7 +678,7 @@ struct FmhaFwdSplitKVKernel return pad_tensor_view( v_dram_transposed, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { @@ -692,7 +692,7 @@ struct FmhaFwdSplitKVKernel return pad_tensor_view( v_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }; const auto v_dram = [&]() { @@ -804,9 +804,8 @@ struct FmhaFwdSplitKVKernel number{}, number<1>{}); - return pad_tensor_view(bias_dram_naive, - bias_dram_window_lengths, - sequence{}); + return pad_tensor_view( + bias_dram_naive, bias_dram_window_lengths, sequence{}); }(); return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 
0}); From 4cb3d7d7eac162af2c6e1a1d9c3367cb7633347c Mon Sep 17 00:00:00 2001 From: Mateusz Ozga <110818320+mozga-amd@users.noreply.github.com> Date: Wed, 4 Dec 2024 21:40:01 +0100 Subject: [PATCH 36/52] Ck tile grouped GEMM example (#1713) * Ck-tile, impl. grouped gemm * Workspace is allocated by user, and is passed to the function * Prepare test to new api design * Unify GemTransKernelArgs, removing N0 param * Add 1 to dim3 in paritioner * Typo: gem - > gemm --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> --- .../ck_tile/17_grouped_gemm/CMakeLists.txt | 2 + example/ck_tile/17_grouped_gemm/README.md | 25 ++ .../ck_tile/17_grouped_gemm/grouped_gemm.cpp | 151 +++++++++ .../ck_tile/17_grouped_gemm/grouped_gemm.hpp | 53 +++ .../run_grouped_gemm_example.inc | 191 +++++++++++ example/ck_tile/17_grouped_gemm/utils.hpp | 38 +++ example/ck_tile/CMakeLists.txt | 1 + .../core/utility/amd_address_space.hpp | 37 +++ include/ck_tile/ops/gemm.hpp | 1 + .../ops/gemm/kernel/gemm_tile_partitioner.hpp | 36 ++ .../ops/gemm/kernel/grouped_gemm_kernel.hpp | 310 ++++++++++++++++++ test/ck_tile/CMakeLists.txt | 1 + test/ck_tile/grouped_gemm/CMakeLists.txt | 4 + .../grouped_gemm/test_grouped_gemm.cpp | 29 ++ .../test_grouped_gemm_ut_cases.inc | 25 ++ .../grouped_gemm/test_grouped_gemm_util.hpp | 282 ++++++++++++++++ 16 files changed, 1186 insertions(+) create mode 100644 example/ck_tile/17_grouped_gemm/CMakeLists.txt create mode 100644 example/ck_tile/17_grouped_gemm/README.md create mode 100644 example/ck_tile/17_grouped_gemm/grouped_gemm.cpp create mode 100644 example/ck_tile/17_grouped_gemm/grouped_gemm.hpp create mode 100644 example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc create mode 100644 example/ck_tile/17_grouped_gemm/utils.hpp create mode 100644 include/ck_tile/core/utility/amd_address_space.hpp create mode 100644 include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp create mode 100644 test/ck_tile/grouped_gemm/CMakeLists.txt 
create mode 100644 test/ck_tile/grouped_gemm/test_grouped_gemm.cpp create mode 100644 test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc create mode 100644 test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt new file mode 100644 index 0000000000..d34013dd6c --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp) + diff --git a/example/ck_tile/17_grouped_gemm/README.md b/example/ck_tile/17_grouped_gemm/README.md new file mode 100644 index 0000000000..d1a0458eda --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/README.md @@ -0,0 +1,25 @@ +# Grouped CShuffle GEMM + +This folder contains an example of Grouped GEMM using the ck_tile tile-programming implementation. Currently, it only supports the basic features of the CK Tile GEMM, but creates placeholders for future support of different GEMM pipelines and different GEMM modules. In the near future, we will gradually migrate all the GEMM features from old CK to CK Tile. + +## build +``` +# in the root of ck_tile +mkdir build && cd build +# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ <arch> +# The basic pipeline method on the gemm calculation +make tile_example_grouped_gemm -j +``` +This will result in an executable `build/bin/tile_example_grouped_gemm` + +## example +``` +args: + -a_layout Tensor A layout (default:R) + -b_layout Tensor B layout (default:R) + -c_layout Tensor C layout (default:R) + -validate 0. No validation, 1.
Validation on CPU + -warmup number of iterations before benchmark the kernel (default:10) + -repeat number of iterations to benchmark the kernel (default:100) +``` diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp new file mode 100644 index 0000000000..14f3b4a5b8 --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/host.hpp" +#include "grouped_gemm.hpp" +#include "utils.hpp" + +namespace { + +struct GroupedGemmKernelParam +{ + static const bool kPadM = false; + static const bool kPadN = false; + static const bool kPadK = false; + static const bool kTilePermute = false; + + static const ck_tile::index_t kOutputRank = 2; + + static const int kBlockPerCu = 1; + static const ck_tile::index_t M_Tile = 128; + static const ck_tile::index_t N_Tile = 128; + static const ck_tile::index_t K_Tile = 32; + + static const ck_tile::index_t M_Warp = 2; + static const ck_tile::index_t N_Warp = 2; + static const ck_tile::index_t K_Warp = 1; + + static const ck_tile::index_t M_Warp_Tile = 32; + static const ck_tile::index_t N_Warp_Tile = 32; + static const ck_tile::index_t K_Warp_Tile = 8; +}; + +using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + +using TilePartitioner = ck_tile::GemmTile1DPartitioner; + +template +using GemmEpilogue = std::conditional_t< + std::is_same_v, + ck_tile::CShuffleEpilogue>, + ck_tile::Default2DEpilogue>>; + +template +using CodegenGemmTraits = ck_tile::TileGemmTraits; + +template +using CodegenPipelineProblem = + ck_tile::GemmPipelineProblem>; + +using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy; + 
+template +using CodegenGemmPipeline = + ck_tile::GemmPipelineAGmemBGmemCRegV1, + CodegenGemmPolicy>; + +template +using Kernel = ck_tile::GroupedGemmKernel, + GemmEpilogue>; +}; // namespace + +std::size_t GetWorkspaceSize(const std::vector& gemm_descs) +{ + return ::Kernel::GetWorkSpaceSize(gemm_descs); +} + +template +float grouped_gemm(const std::vector& gemm_descs, + const ck_tile::stream_config& s, + void* p_workspace_) +{ + using GroupedGemmKernel = ::Kernel; + + auto arguments = GroupedGemmKernel::MakeKargs(gemm_descs); + + const dim3 grids = GroupedGemmKernel::GridSize(gemm_descs); + constexpr dim3 blocks = GroupedGemmKernel::BlockSize(); + + ck_tile::hip_check_error(hipMemcpyWithStream( + p_workspace_, + arguments.data(), + arguments.size() * sizeof(typename GroupedGemmKernel::GemmTransKernelArg), + hipMemcpyHostToDevice, + s.stream_id_)); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + float ave_time = + ck_tile::launch_kernel(s, + ck_tile::make_kernel( + GroupedGemmKernel{}, + grids, + blocks, + 0, + ck_tile::cast_pointer_to_constant_address_space(p_workspace_), + gemm_descs.size())); + return ave_time; +} + +#include "run_grouped_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp new file mode 100644 index 0000000000..94af4711d1 --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp" + +template +struct GemmBasicTypeConfig; + +template <> +struct GemmBasicTypeConfig +{ + using ADataType = ck_tile::half_t; + using BDataType = ck_tile::half_t; + using CDataType = ck_tile::half_t; + using AccDataType = float; +}; + +using Types = GemmBasicTypeConfig; + +// Specific type aliases for easy access +using ADataType = Types::ADataType; +using BDataType = Types::BDataType; +using AccDataType = Types::AccDataType; +using CDataType = Types::CDataType; + +using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs; + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("a_layout", "R", "A tensor data layout - Row by default") + .insert("b_layout", "R", "B tensor data layout - Row by default") + .insert("c_layout", "R", "C tensor data layout - Row by default") + .insert("validate", "1", "0. No validation, 1. Validation on CPU") + .insert("warmup", "10", "number of iterations before benchmark the kernel") + .insert("repeat", "100", "number of iterations to benchmark the kernel") + .insert("group_count", "16", "group count"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +std::size_t GetWorkspaceSize(const std::vector& gemm_descs); + +float grouped_gemm_calc(const std::vector& gemm_descs, + const ck_tile::stream_config& s, + void* p_workspace_); diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc new file mode 100644 index 0000000000..cd5b1c2864 --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +template +float invoke_gemm(int n_warmup, + int n_repeat, + int group_count, + const std::vector& args) +{ + + ck_tile::DeviceMem gemm_workspace; + gemm_workspace.Realloc(GetWorkspaceSize(args)); + + float ave_time = grouped_gemm( + args, + ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}, + gemm_workspace.GetDeviceBuffer()); + + std::string op_name{"Grouped Gemm"}; + + std::size_t flop = 0, num_btype = 0; + for(int j = 0; j < group_count; ++j) + { + flop += std::size_t(2) * args[j].M * args[j].N * args[j].K; + + num_btype += sizeof(ADataType) * args[j].M * args[j].K + + sizeof(BDataType) * args[j].K * args[j].N + + sizeof(CDataType) * args[j].M * args[j].N; + } + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + return ave_time; +} + +template +int run_grouped_gemm_example_with_layouts(int argc, + char* argv[], + const ALayout a_layout = ALayout{}, + const BLayout b_layout = BLayout{}, + [[maybe_unused]] const CLayout c_layout = CLayout{}) +{ + auto [result, arg_parser] = create_args(argc, argv); + + if(!result) + { + return -1; + }; + + const int group_count = arg_parser.get_int("group_count"); + const int repeat = arg_parser.get_int("repeat"); + const int warmup = arg_parser.get_int("warmup"); + + std::vector Ms; + std::vector Ns; + std::vector Ks; + std::vector stride_As; + std::vector stride_Bs; + std::vector stride_Cs; + + for(int i = 0; i < group_count; i++) + { + Ms.push_back(256 + 256 * i); + Ns.push_back(128 + 128 * i); + Ks.push_back(128 + 64 * i); + + stride_As.push_back(Ks[i]); + stride_Bs.push_back(Ks[i]); + stride_Cs.push_back(Ns[i]); + } + + std::vector> a_m_k_tensors; + std::vector> b_k_n_tensors; + std::vector> c_m_n_tensors; + + a_m_k_tensors.reserve(group_count); + b_k_n_tensors.reserve(group_count); + 
c_m_n_tensors.reserve(group_count); + + std::vector> a_m_k_dev_buf; + std::vector> b_k_n_dev_buf; + std::vector> c_m_n_dev_buf; + + a_m_k_dev_buf.reserve(group_count); + b_k_n_dev_buf.reserve(group_count); + c_m_n_dev_buf.reserve(group_count); + + std::vector gemm_descs; + gemm_descs.reserve(group_count); + + for(int i = 0; i < group_count; ++i) + { + const ck_tile::index_t M = Ms[i]; + const ck_tile::index_t N = Ns[i]; + const ck_tile::index_t K = Ks[i]; + + stride_As[i] = f_get_default_stride(M, N, stride_As[i], a_layout); + stride_Bs[i] = f_get_default_stride(K, N, stride_Bs[i], b_layout); + stride_Cs[i] = f_get_default_stride(M, N, stride_Cs[i], CLayout{}); + + a_m_k_tensors.push_back( + ck_tile::HostTensor(f_host_tensor_descriptor(M, K, stride_As[i], a_layout))); + b_k_n_tensors.push_back( + ck_tile::HostTensor(f_host_tensor_descriptor(K, N, stride_Bs[i], b_layout))); + c_m_n_tensors.push_back(ck_tile::HostTensor( + f_host_tensor_descriptor(M, N, stride_Cs[i], CLayout{}))); + + std::cout << "gemm[" << i << "]" + << " a_m_k: " << a_m_k_tensors[i].mDesc << " b_k_n: " << b_k_n_tensors[i].mDesc + << " c_m_n: " << c_m_n_tensors[i].mDesc << std::endl; + + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k_tensors[i]); + ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n_tensors[i]); + + a_m_k_dev_buf.push_back(std::make_unique( + a_m_k_tensors[i].get_element_space_size_in_bytes())); + b_k_n_dev_buf.push_back(std::make_unique( + b_k_n_tensors[i].get_element_space_size_in_bytes())); + c_m_n_dev_buf.push_back(std::make_unique( + c_m_n_tensors[i].get_element_space_size_in_bytes())); + + a_m_k_dev_buf[i]->ToDevice(a_m_k_tensors[i].data()); + b_k_n_dev_buf[i]->ToDevice(b_k_n_tensors[i].data()); + c_m_n_dev_buf[i]->SetZero(); + c_m_n_tensors[i].SetZero(); + + const void* p_a = a_m_k_dev_buf[i]->GetDeviceBuffer(); + const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer(); + void* p_c = c_m_n_dev_buf[i]->GetDeviceBuffer(); + + gemm_descs.push_back({p_a, p_b, p_c, M, N, K, 
stride_As[i], stride_Bs[i], stride_Cs[i]}); + } + + invoke_gemm(warmup, repeat, group_count, gemm_descs); + + for(int i = 0; i < group_count; i++) + { + c_m_n_dev_buf[i]->FromDevice(c_m_n_tensors[i].data()); + } + + bool pass{true}; + if(arg_parser.get_int("validate")) + { + for(int i = 0; i < group_count; ++i) + { + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{})); + c_m_n_host_ref.SetZero(); + ck_tile::reference_gemm( + a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref); + pass &= ck_tile::check_err(c_m_n_tensors[i], c_m_n_host_ref); + } + std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl; + } + + return pass; +} + +int run_grouped_gemm_example(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + { + return -1; + } + + const std::string a_layout = arg_parser.get_str("a_layout"); + const std::string b_layout = arg_parser.get_str("b_layout"); + + using Row = ck_tile::tensor_layout::gemm::RowMajor; + using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + + if(a_layout == "R" && b_layout == "C") + { + return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + else if(a_layout == "R" && b_layout == "R") + { + return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{}); + } + else + { + throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); + } +} diff --git a/example/ck_tile/17_grouped_gemm/utils.hpp b/example/ck_tile/17_grouped_gemm/utils.hpp new file mode 100644 index 0000000000..bb3cdf9fdc --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/utils.hpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +template +constexpr auto +f_host_tensor_descriptor(std::size_t row, std::size_t col, std::size_t stride, TLayout layout) +{ + using namespace ck_tile::literals; + + if constexpr(std::is_same_v) + { + return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride}); + } +} +template +constexpr auto +f_get_default_stride(std::size_t row, std::size_t col, std::size_t stride, TLayout layout) +{ + if(stride == 0) + { + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; +} diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 51ebb5bf07..296eb1ecef 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -16,3 +16,4 @@ add_subdirectory(13_moe_sorting) add_subdirectory(14_moe_smoothquant) add_subdirectory(15_fused_moe) add_subdirectory(16_batched_gemm) +add_subdirectory(17_grouped_gemm) diff --git a/include/ck_tile/core/utility/amd_address_space.hpp b/include/ck_tile/core/utility/amd_address_space.hpp new file mode 100644 index 0000000000..cb242bf0d5 --- /dev/null +++ b/include/ck_tile/core/utility/amd_address_space.hpp @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/config.hpp" + +// Address Space for AMDGCN +// https://llvm.org/docs/AMDGPUUsage.html#address-space + +namespace ck_tile { + +#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4))) + +template +__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p) +{ + // cast a pointer in "Constant" address space (4) to "Generic" address space (0) + // only c-style pointer cast seems be able to be compiled +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + return (T*)p; // NOLINT(old-style-cast) +#pragma clang diagnostic pop +} + +template +__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p) +{ + // cast a pointer in "Generic" address space (0) to "Constant" address space (4) + // only c-style pointer cast seems be able to be compiled +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast) +#pragma clang diagnostic pop +} + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index b9eb248581..82d35b9c59 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -25,6 +25,7 @@ #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" +#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp" diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp index 6387233c0f..8ffe681f90 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp +++ 
b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp @@ -35,4 +35,40 @@ struct GemmTilePartitioner return make_tuple(iM, iN); } }; + +template +struct GemmTile1DPartitioner +{ + using BlockGemmShape = remove_cvref_t; + + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; + + CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N) + { + index_t GridDimX = (M + MPerBlock - 1) / MPerBlock; + index_t GridDimY = (N + NPerBlock - 1) / NPerBlock; + return dim3(GridDimX * GridDimY, 1, 1); + } + + CK_TILE_HOST_DEVICE static constexpr auto GetNBlock(index_t N) + { + return integer_divide_ceil(N, NPerBlock); + } + + CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K) + { + return integer_divide_ceil(K, KPerBlock); + } + + CK_TILE_DEVICE auto operator()(index_t blockOffset, index_t NBlockSize) + { + index_t iM = __builtin_amdgcn_readfirstlane((blockIdx.x - blockOffset) / + GetNBlock(NBlockSize) * MPerBlock); + index_t iN = __builtin_amdgcn_readfirstlane((blockIdx.x - blockOffset) % + GetNBlock(NBlockSize) * NPerBlock); + return make_tuple(iM, iN); + } +}; } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp new file mode 100644 index 0000000000..f24fc47afc --- /dev/null +++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck_tile/core/numeric/math.hpp" +#include "ck_tile/core/utility/literals.hpp" +#include "ck_tile/core/utility/amd_address_space.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/host.hpp" + +namespace ck_tile { + +struct GroupedGemmHostArgs +{ + const void* a_ptr; + const void* b_ptr; + void* c_ptr; + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; +}; + +template +struct GroupedGemmKernel +{ + using TilePartitioner = remove_cvref_t; + using GemmPipeline = remove_cvref_t; + using EpiloguePipeline = remove_cvref_t; + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize; + + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + + struct GemmTransKernelArg + { + GroupedGemmHostArgs group_karg; + ck_tile::index_t block_start; + ck_tile::index_t block_end; + + GemmTransKernelArg() = default; + GemmTransKernelArg(GroupedGemmHostArgs&& karg, index_t bl_start, index_t bl_end) + : group_karg{karg}, block_start{bl_start}, block_end{bl_end} + { + } + }; + + __host__ static size_t GetWorkSpaceSize(const std::vector& gemm_descs) + { + return gemm_descs.size() * sizeof(GemmTransKernelArg); + } + + __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); } + + using Hargs = GroupedGemmHostArgs; + + __host__ static constexpr auto GridSize(const std::vector& gemm_descs) + { + index_t grid_size = 0; + for(const auto& it_desc : gemm_descs) + { + const auto dim3 = TilePartitioner::GridSize(it_desc.M, it_desc.N); + grid_size += dim3.x * dim3.y * 1; + } + return dim3(grid_size, 1, 1); + } + + CK_TILE_HOST static auto MakeKargs(const std::vector& gemm_descs) + { + std::vector gemm_kernel_args_; 
+ index_t group_count = ck_tile::type_convert(gemm_descs.size()); + index_t grid_size = 0; + gemm_kernel_args_.reserve(group_count); + + for(std::size_t i = 0; i < gemm_descs.size(); ++i) + { + const index_t M = gemm_descs[i].M; + const index_t N = gemm_descs[i].N; + const index_t K = gemm_descs[i].K; + + if(M == 0 || N == 0 || K == 0) + { + continue; + } + + const index_t stride_a = gemm_descs[i].stride_A; + const index_t stride_b = gemm_descs[i].stride_B; + const index_t stride_c = gemm_descs[i].stride_C; + + const auto dim3 = TilePartitioner::GridSize(M, N); + const index_t grid_size_grp = dim3.x * 1 * 1; + + const index_t block_start = grid_size; + const index_t block_end = grid_size + grid_size_grp; + + grid_size += grid_size_grp; + + auto karg = GroupedGemmHostArgs{type_convert(gemm_descs[i].a_ptr), + type_convert(gemm_descs[i].b_ptr), + type_convert(gemm_descs[i].c_ptr), + M, + N, + K, + stride_a, + stride_b, + stride_c}; + + gemm_kernel_args_.emplace_back(std::move(karg), block_start, block_end); + } + + return gemm_kernel_args_; + } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); + } + + CK_TILE_DEVICE void Run(const Hargs& kargs, const index_t block_start) const + { + const auto [i_m, i_n] = TilePartitioner{}(block_start, kargs.N); + // options + const ADataType* a_start = static_cast(kargs.a_ptr); + const BDataType* b_start = static_cast(kargs.b_ptr); + // Convert pointers to tensor views + auto a_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + a_start, + make_tuple(kargs.M, kargs.K), + make_tuple(kargs.stride_A, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + a_start, + make_tuple(kargs.M, kargs.K), + make_tuple(1, kargs.stride_A), + number<1>{}, + number<1>{}); + } + }(); + + auto b_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + b_start, 
+ make_tuple(kargs.N, kargs.K), + make_tuple(1, kargs.stride_B), + number<1>{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + b_start, + make_tuple(kargs.N, kargs.K), + make_tuple(kargs.stride_B, 1), + number{}, + number<1>{}); + } + }(); + + auto a_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view(a_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + else + { + return pad_tensor_view(a_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + }(); + // clang-format on + + auto a_block_window = make_tile_window( + a_pad_view, + make_tuple(number{}, number{}), + {i_m, 0}); + + auto b_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view(b_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + else + { + return pad_tensor_view(b_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + }(); + + auto b_block_window = make_tile_window( + b_pad_view, + make_tuple(number{}, number{}), + {i_n, 0}); + + // allocate LDS + __shared__ char smem_ptr[GetSmemSize()]; + + const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K); + + // Run GEMM cooperatively by the whole workgroup.
+ auto c_block_tile = + GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr); + + CDataType* c_start = static_cast(kargs.c_ptr); + auto c_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + c_start, + make_tuple(kargs.M, kargs.N), + make_tuple(kargs.stride_C, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + c_start, + make_tuple(kargs.M, kargs.N), + make_tuple(1, kargs.stride_C), + number<1>{}, + number<1>{}); + } + }(); + + auto c_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view(c_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + else + { + return pad_tensor_view(c_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + }(); + auto CBlockWindow_pad = make_tile_window( + c_pad_view, + make_tuple(number{}, number{}), + {i_m, i_n}); + + EpiloguePipeline{}(CBlockWindow_pad, c_block_tile); + } + + CK_TILE_DEVICE void operator()(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const, + int group_count) const + { + const index_t block_id = ck_tile::get_block_1d_id(); + const auto gemm_desc_ptr = reinterpret_cast( + cast_pointer_to_generic_address_space(gemm_descs_const)); + + index_t left = 0; + index_t right = group_count; + index_t group_id = index_t((left + right) / 2); + + while((!(block_id >= gemm_desc_ptr[group_id].block_start && + block_id < gemm_desc_ptr[group_id].block_end)) && + left <= right) + { + if(block_id < gemm_desc_ptr[group_id].block_start) + { + right = group_id; + } + else + { + left = group_id; + } + group_id = index_t((left + right) / 2); + } + + Run(gemm_desc_ptr[group_id].group_karg, gemm_desc_ptr[group_id].block_start); + } +}; + +} // namespace ck_tile diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index fd0de0f9c1..77cf35f667 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(image_to_column) 
add_subdirectory(gemm) add_subdirectory(batched_gemm) +add_subdirectory(grouped_gemm) diff --git a/test/ck_tile/grouped_gemm/CMakeLists.txt b/test/ck_tile/grouped_gemm/CMakeLists.txt new file mode 100644 index 0000000000..f4845847f1 --- /dev/null +++ b/test/ck_tile/grouped_gemm/CMakeLists.txt @@ -0,0 +1,4 @@ +# Currently ck_tile is only built on gfx9 +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_ck_tile_grouped_gemm test_grouped_gemm.cpp) +endif() diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp b/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp new file mode 100644 index 0000000000..1bce0f8aa9 --- /dev/null +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "gtest/gtest.h" + +#include "ck_tile/host.hpp" +#include "test_grouped_gemm_util.hpp" + +using F16 = ck_tile::half_t; +using F32 = float; + +using Row = ck_tile::tensor_layout::gemm::RowMajor; +using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + +// clang-format off +using KernelTypes = ::testing::Types< + // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType + std::tuple< Row, Row, Row, F16, F16, F32, F16>, + //std::tuple< Col, Row, Row, F16, F16, F32, F16>, + std::tuple< Row, Col, Row, F16, F16, F32, F16>//, + //std::tuple< Col, Col, Row, F16, F16, F32, F16> + >; +// clang-format on + +TYPED_TEST_SUITE(TestCkTileGroupedGemm, KernelTypes); + +#include "test_grouped_gemm_ut_cases.inc" diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc b/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc new file mode 100644 index 0000000000..68c4693bb3 --- /dev/null +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc @@ -0,0 +1,25 @@ +#pragma once + +TYPED_TEST(TestCkTileGroupedGemm, Basic) +{ + const int group_count = 16; + std::vector Ms; + std::vector Ns; + std::vector Ks; + std::vector 
stride_As; + std::vector stride_Bs; + std::vector stride_Cs; + + for(int i = 0; i < group_count; i++) + { + Ms.push_back(256 + 256 * i); + Ns.push_back(128 + 128 * i); + Ks.push_back(128 + 64 * i); + + stride_As.push_back(Ks[i]); + stride_Bs.push_back(Ks[i]); + stride_Cs.push_back(Ns[i]); + } + + this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, group_count); +} diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp new file mode 100644 index 0000000000..f532de21dc --- /dev/null +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp" + +template +class TestCkTileGroupedGemm : public ::testing::Test +{ + protected: + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = std::tuple_element_t<5, Tuple>; + using CDataType = std::tuple_element_t<6, Tuple>; + + struct GroupedGemKernelParam + { + static const bool kPadM = false; + static const bool kPadN = false; + static const bool kPadK = false; + static const bool kTilePermute = false; + + static const ck_tile::index_t kOutputRank = 2; + + static const int kBlockPerCu = 1; + static const ck_tile::index_t M_Tile = 128; + static const ck_tile::index_t N_Tile = 128; + static const ck_tile::index_t K_Tile = 32; + + static const ck_tile::index_t M_Warp = 2; + static const ck_tile::index_t N_Warp = 2; + static const ck_tile::index_t K_Warp = 1; + + 
static const ck_tile::index_t M_Warp_Tile = 32; + static const ck_tile::index_t N_Warp_Tile = 32; + static const ck_tile::index_t K_Warp_Tile = 8; + }; + + using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::GemmTile1DPartitioner; + + template + using GemmEpilogue = + std::conditional_t, + ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>, + ck_tile::Default2DEpilogue< + ck_tile::Default2DEpilogueProblem>>; + + template + using CodegenGemmTraits = ck_tile::TileGemmTraits; + + template + using CodegenPipelineProblem = + ck_tile::GemmPipelineProblem>; + + using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy; + + template + using CodegenGemmPipeline = + ck_tile::GemmPipelineAGmemBGmemCRegV1, + CodegenGemmPolicy>; + + template + using Kernel = ck_tile::GroupedGemmKernel, + GemmEpilogue>; + + using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs; + std::size_t GetWorkspaceSize(const std::vector& gemm_descs) + { + return Kernel::GetWorkSpaceSize(gemm_descs); + } + + template + void invoke_grouped_gemm(const std::vector& gemm_descs, + const ck_tile::stream_config& s, + void* p_workspace_) + { + using GroupedGemmKernel = Kernel; + + auto arguments = GroupedGemmKernel::MakeKargs(gemm_descs); + + const dim3 grids = GroupedGemmKernel::GridSize(gemm_descs); + constexpr dim3 blocks = GroupedGemmKernel::BlockSize(); + + ck_tile::hip_check_error(hipMemcpyWithStream( + p_workspace_, + arguments.data(), + arguments.size() * sizeof(typename GroupedGemmKernel::GemmTransKernelArg), + hipMemcpyHostToDevice, + s.stream_id_)); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + ck_tile::launch_kernel(s, + ck_tile::make_kernel( + GroupedGemmKernel{}, + grids, + blocks, + 0, + 
ck_tile::cast_pointer_to_constant_address_space(p_workspace_), + gemm_descs.size())); + } + + public: + void Run(const std::vector& Ms, + const std::vector& Ns, + const std::vector& Ks, + std::vector& stride_As, + std::vector& stride_Bs, + std::vector& stride_Cs, + const int group_count = 16) + { + using namespace ck_tile::literals; + auto f_host_tensor_descriptor = [](std::size_t row, + std::size_t col, + std::size_t stride, + auto layout) { + if constexpr(std::is_same_v) + { + return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + auto f_get_default_stride = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(stride == 0) + { + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; + }; + + std::vector> a_m_k_tensors; + std::vector> b_k_n_tensors; + std::vector> c_m_n_tensors; + + a_m_k_tensors.reserve(group_count); + b_k_n_tensors.reserve(group_count); + c_m_n_tensors.reserve(group_count); + + std::vector> a_m_k_dev_buf; + std::vector> b_k_n_dev_buf; + std::vector> c_m_n_dev_buf; + + a_m_k_dev_buf.reserve(group_count); + b_k_n_dev_buf.reserve(group_count); + c_m_n_dev_buf.reserve(group_count); + + std::vector gemm_descs; + gemm_descs.reserve(group_count); + + for(int i = 0; i < group_count; ++i) + { + const ck_tile::index_t M = Ms[i]; + const ck_tile::index_t N = Ns[i]; + const ck_tile::index_t K = Ks[i]; + + stride_As[i] = f_get_default_stride(M, N, stride_As[i], ALayout{}); + stride_Bs[i] = f_get_default_stride(K, N, stride_Bs[i], BLayout{}); + stride_Cs[i] = f_get_default_stride(M, N, stride_Cs[i], CLayout{}); + + a_m_k_tensors.push_back(ck_tile::HostTensor( + f_host_tensor_descriptor(M, K, stride_As[i], ALayout{}))); + b_k_n_tensors.push_back(ck_tile::HostTensor( + f_host_tensor_descriptor(K, N, stride_Bs[i], BLayout{}))); + c_m_n_tensors.push_back(ck_tile::HostTensor( + 
f_host_tensor_descriptor(M, N, stride_Cs[i], CLayout{}))); + + std::cout << "gemm[" << i << "]" + << " a_m_k: " << a_m_k_tensors[i].mDesc + << " b_k_n: " << b_k_n_tensors[i].mDesc + << " c_m_n: " << c_m_n_tensors[i].mDesc << std::endl; + + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k_tensors[i]); + ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n_tensors[i]); + + a_m_k_dev_buf.push_back(std::make_unique( + a_m_k_tensors[i].get_element_space_size_in_bytes())); + b_k_n_dev_buf.push_back(std::make_unique( + b_k_n_tensors[i].get_element_space_size_in_bytes())); + c_m_n_dev_buf.push_back(std::make_unique( + c_m_n_tensors[i].get_element_space_size_in_bytes())); + + a_m_k_dev_buf[i]->ToDevice(a_m_k_tensors[i].data()); + b_k_n_dev_buf[i]->ToDevice(b_k_n_tensors[i].data()); + c_m_n_dev_buf[i]->SetZero(); + c_m_n_tensors[i].SetZero(); + + const void* p_a = a_m_k_dev_buf[i]->GetDeviceBuffer(); + const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer(); + void* p_c = c_m_n_dev_buf[i]->GetDeviceBuffer(); + + gemm_descs.push_back( + {p_a, p_b, p_c, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]}); + } + + ck_tile::DeviceMem gemm_workspace; + gemm_workspace.Realloc(GetWorkspaceSize(gemm_descs)); + + invoke_grouped_gemm( + gemm_descs, ck_tile::stream_config{nullptr, false}, gemm_workspace.GetDeviceBuffer()); + + for(int i = 0; i < group_count; i++) + { + c_m_n_dev_buf[i]->FromDevice(c_m_n_tensors[i].data()); + } + + bool pass{true}; + for(int i = 0; i < group_count; ++i) + { + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{})); + c_m_n_host_ref.SetZero(); + ck_tile::reference_gemm( + a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref); + pass &= ck_tile::check_err(c_m_n_tensors[i], c_m_n_host_ref); + } + EXPECT_TRUE(pass); + } +}; From d2d1d177ffe04f0ff25fed0aedcb3ede0e07c51b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 4 Dec 2024 22:05:47 -0800 Subject: 
[PATCH 37/52] Bump rocm-docs-core from 1.10.0 to 1.11.0 in /docs/sphinx (#1720) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.10.0 to 1.11.0. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.10.0...v1.11.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 9969824d25..d1b3465b9c 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.10.0 +rocm-docs-core==1.11.0 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index bb731db2dd..26d0aa2446 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -103,7 +103,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.10.0 +rocm-docs-core==1.11.0 # via -r requirements.in six==1.16.0 # via pybtex From feb9a2bd9b50da9d449e5931e936d527a0db89fe Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 5 Dec 2024 09:02:13 +0100 Subject: [PATCH 38/52] Add IsSupportedArgument() to gemm_kernel (#1698) * add IsSupportedArgument to gemm_kernel * add ut and do some refactoring * switched to ck_tile's integral_constant --- example/ck_tile/03_gemm/gemm_basic.cpp | 5 ++ example/ck_tile/03_gemm/universal_gemm.cpp | 5 ++ .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 73 +++++++++++++++++++ test/ck_tile/gemm/test_gemm_mem_pipeline.cpp | 42 +++++------ .../gemm/test_gemm_mem_pipeline_ut_cases.inc | 61 ++++------------ 
.../gemm/test_gemm_mem_pipeline_util.hpp | 22 ++++-- 6 files changed, 129 insertions(+), 79 deletions(-) diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp index b7d8693442..f5260c306e 100644 --- a/example/ck_tile/03_gemm/gemm_basic.cpp +++ b/example/ck_tile/03_gemm/gemm_basic.cpp @@ -92,6 +92,11 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); constexpr dim3 blocks = Kernel::BlockSize(); + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + } + if(s.log_level_ > 0) { std::cout << "Launching kernel with args:" diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index eaafc13b98..6c87ca0087 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -119,6 +119,11 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); constexpr dim3 blocks = Kernel::BlockSize(); + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! 
Skipping gemm!\n"); + } + if(s.log_level_ > 0) { std::cout << "Launching kernel with args:" diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index 96af6e8260..763d8cad9c 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -66,6 +66,79 @@ struct GemmKernel return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); } + CK_TILE_HOST static bool IsSupportedArgument(const GemmCommonKargs& kargs) + { + if constexpr(std::is_same_v) + { + if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false) + { + return false; + } + if(kargs.K % GemmPipeline::VectorSizeA != 0) + { + return false; + } + } + else + { + if(kargs.M % TilePartitioner::kM != 0 && GemmPipeline::kPadM == false) + { + return false; + } + if(kargs.M % GemmPipeline::VectorSizeA != 0) + { + return false; + } + } + + if constexpr(std::is_same_v) + { + if(kargs.N % TilePartitioner::kN != 0 && GemmPipeline::kPadN == false) + { + return false; + } + if(kargs.N % GemmPipeline::VectorSizeB != 0) + { + return false; + } + } + else + { + if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false) + { + return false; + } + if(kargs.K % GemmPipeline::VectorSizeB != 0) + { + return false; + } + } + + if constexpr(std::is_same_v) + { + if(kargs.N % TilePartitioner::kN != 0 && GemmPipeline::kPadN == false) + { + return false; + } + if(kargs.N % GemmPipeline::VectorSizeC != 0) + { + return false; + } + } + else + { + if(kargs.M % TilePartitioner::kM != 0 && GemmPipeline::kPadM == false) + { + return false; + } + if(kargs.M % GemmPipeline::VectorSizeC != 0) + { + return false; + } + } + return true; + } + CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const { const auto [i_m, i_n] = TilePartitioner{}(); diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp index a1c80fee4b..aeb383c87d 100644 --- 
a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp @@ -8,35 +8,29 @@ #include "ck_tile/host.hpp" #include "test_gemm_mem_pipeline_util.hpp" -using F16 = ck_tile::half_t; -using F32 = float; - -using Row = ck_tile::tensor_layout::gemm::RowMajor; -using Col = ck_tile::tensor_layout::gemm::ColumnMajor; -static constexpr auto Intrawave = ck_tile::GemmPipelineScheduler::Intrawave; -static constexpr auto Interwave = ck_tile::GemmPipelineScheduler::Interwave; - -template -class TestCkTileGemmMemPipelineIntrawave : public TestCkTileGemmMemPipeline -{ -}; - -template -class TestCkTileGemmMemPipelineInterwave : public TestCkTileGemmMemPipeline -{ -}; +using F16 = ck_tile::half_t; +using F32 = float; +using Row = ck_tile::tensor_layout::gemm::RowMajor; +using Col = ck_tile::tensor_layout::gemm::ColumnMajor; +using Intrawave = ck_tile::integral_constant; +using Interwave = ck_tile::integral_constant; // clang-format off using KernelTypes = ::testing::Types< - // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType - std::tuple< Row, Col, Row, F16, F16, F32, F16>, - std::tuple< Col, Row, Row, F16, F16, F32, F16>, - std::tuple< Row, Row, Row, F16, F16, F32, F16>, - std::tuple< Col, Col, Row, F16, F16, F32, F16> + // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler + std::tuple< Row, Row, Row, F16, F16, F32, F16, Intrawave>, + std::tuple< Row, Row, Row, F16, F16, F32, F16, Interwave>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, Intrawave>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, Interwave>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, Intrawave>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, Interwave>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, Interwave> >; // clang-format on -TYPED_TEST_SUITE(TestCkTileGemmMemPipelineIntrawave, KernelTypes); 
-TYPED_TEST_SUITE(TestCkTileGemmMemPipelineInterwave, KernelTypes); +TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes); #include "test_gemm_mem_pipeline_ut_cases.inc" diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc index 6b914e7975..af94d68f2c 100644 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc @@ -3,11 +3,7 @@ #pragma once -//------------------------------------------------------------------------------------------------ -// INTERWAVE SCHEDULER -//------------------------------------------------------------------------------------------------ - -TYPED_TEST(TestCkTileGemmMemPipelineInterwave, SmallM) +TYPED_TEST(TestCkTileGemmMemPipeline, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 1024; @@ -17,7 +13,7 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, SmallM) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipelineInterwave, MidLargeM) +TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 1024; @@ -27,7 +23,7 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, MidLargeM) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipelineInterwave, PaddK) +TYPED_TEST(TestCkTileGemmMemPipeline, PaddK) { std::vector Ms{127}; constexpr int N = 1024; @@ -37,7 +33,7 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, PaddK) this->Run(M, N, K); } -TYPED_TEST(TestCkTileGemmMemPipelineInterwave, Regular) +TYPED_TEST(TestCkTileGemmMemPipeline, Regular) { std::vector Ms{512}; constexpr int N = 1024; @@ -47,46 +43,15 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, Regular) this->Run(M, N, K); } -//------------------------------------------------------------------------------------------------ -// INTRAWAVE SCHEDULER -//------------------------------------------------------------------------------------------------ - 
-TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, SmallM) +TYPED_TEST(TestCkTileGemmMemPipeline, NotSupportedArgument) { - std::vector Ms{1, 2, 3, 4, 5, 6}; - constexpr int N = 1024; - constexpr int K = 320; + constexpr int M = 512; + constexpr int N = 1025; + constexpr int K = 513; - for(int M : Ms) - this->Run(M, N, K); -} - -TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, MidLargeM) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - constexpr int N = 1024; - constexpr int K = 320; - - for(int M : Ms) - this->Run(M, N, K); -} - -TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, PaddK) -{ - std::vector Ms{127}; - constexpr int N = 1024; - constexpr int K = 432; - - for(int M : Ms) - this->Run(M, N, K); -} - -TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, Regular) -{ - std::vector Ms{512}; - constexpr int N = 1024; - constexpr int K = 512; - - for(int M : Ms) - this->Run(M, N, K); + constexpr bool PadM = false; + constexpr bool PadN = false; + constexpr bool PadK = false; + + EXPECT_THROW((this->template Run(M, N, K)), std::runtime_error); } diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp index 15f9f516ee..6941a7596a 100644 --- a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp @@ -11,7 +11,7 @@ #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/gemm.hpp" -template +template class TestCkTileGemmMemPipeline : public ::testing::Test { protected: @@ -22,7 +22,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test using BDataType = std::tuple_element_t<4, Tuple>; using AccDataType = std::tuple_element_t<5, Tuple>; using CDataType = std::tuple_element_t<6, Tuple>; - static constexpr auto Scheduler = Scheduler_; + static constexpr auto Scheduler = std::tuple_element_t<7, Tuple>::value; // TODO: expose tile size through test t-param ? 
struct gemm_args @@ -39,6 +39,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test ck_tile::index_t stride_C; }; + template void invoke_gemm(const gemm_args& args, const ck_tile::stream_config& s) { // TODO: This should be parameterized in tests @@ -54,9 +55,9 @@ class TestCkTileGemmMemPipeline : public ::testing::Test constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t K_Warp_Tile = 8; - constexpr bool kPadM = true; - constexpr bool kPadN = true; - constexpr bool kPadK = true; + constexpr bool kPadM = PadM; + constexpr bool kPadN = PadN; + constexpr bool kPadK = PadK; constexpr int kBlockPerCu = 1; @@ -107,6 +108,11 @@ class TestCkTileGemmMemPipeline : public ::testing::Test const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch); constexpr dim3 blocks = Kernel::BlockSize(); + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + } + if(s.log_level_ > 0) { std::cout << "Launching kernel with args:" @@ -212,6 +218,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test void SetUp() override { k_batches_ = {1}; } + template void Run(const int M, const int N, const int K, @@ -221,10 +228,11 @@ class TestCkTileGemmMemPipeline : public ::testing::Test { for(auto kb : k_batches_) { - RunSingle(M, N, K, StrideA, StrideB, StrideC, kb); + RunSingle(M, N, K, StrideA, StrideB, StrideC, kb); } } + template void RunSingle(const int M, const int N, const int K, @@ -301,7 +309,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test args.stride_B = stride_B; args.stride_C = stride_C; - invoke_gemm(args, ck_tile::stream_config{nullptr, false}); + invoke_gemm(args, ck_tile::stream_config{nullptr, false}); c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); bool pass = true; From 86990558e39a99d3e2dd909e45f5d38c3b13d956 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:29:12 -0800 Subject: [PATCH 
39/52] Upgrade default compiler to ROCm6.3 (#1723) * upgrade to rocm6.3 compiler * Proposed solution to convnd test failures in ROCm 6.3 --------- Co-authored-by: Andriy Roshchenko --- Dockerfile | 13 ++++-------- Dockerfile.compiler | 2 +- Jenkinsfile | 21 ++++++++++--------- .../convscale/convnd_fwd_convscale_common.hpp | 9 ++++---- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/Dockerfile b/Dockerfile index f9b7d76e3b..6689ae08ff 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:20.04 ARG DEBIAN_FRONTEND=noninteractive -ARG ROCMVERSION=6.2 +ARG ROCMVERSION=6.3 ARG compiler_version="" ARG compiler_commit="" ARG CK_SCCACHE="" @@ -13,17 +13,12 @@ RUN set -xe && \ apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \ curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg -RUN if [ "$ROCMVERSION" != "6.3" ]; then \ - sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate" && \ - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \ +RUN if [ "$ROCMVERSION" != "6.4" ]; then \ + sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.3.60300-1_all.deb --no-check-certificate" && \ + apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \ wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \ - elif [ "$ROCMVERSION" = "6.3" ] && 
[ "$compiler_version" = "rc1" ]; then \ - sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3-20.04-1_all.deb --no-check-certificate" && \ - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3-20.04-1_all.deb && \ - sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3 rel-20 > /etc/apt/sources.list.d/rocm-build.list' && \ - amdgpu-repo --amdgpu-build=2074281; \ fi RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \ diff --git a/Dockerfile.compiler b/Dockerfile.compiler index 354b71f692..3f33290929 100644 --- a/Dockerfile.compiler +++ b/Dockerfile.compiler @@ -1,4 +1,4 @@ -ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.2" +ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.3" FROM $BASE_DOCKER ARG compiler_version="" ARG compiler_commit="" diff --git a/Jenkinsfile b/Jenkinsfile index f8493fa2f6..58cd72c8ce 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -38,13 +38,14 @@ def getBaseDockerImageName(){ img = "${params.USE_CUSTOM_DOCKER}" } else{ - if (params.ROCMVERSION != "6.3"){ - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" + def ROCM_numeric = "${params.ROCMVERSION}" as float + if ( ROCM_numeric < 6.4 ){ + img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" + } + else{ + img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" + } } - else{ - img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" - } - } return img } @@ -739,8 +740,8 @@ def process_results(Map conf=[:]){ } //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version -CRON_SETTINGS = BRANCH_NAME == "develop" ? 
'''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true - 0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true;RUN_CODEGEN_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true + 0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false @@ -765,8 +766,8 @@ pipeline { description: 'If you want to use a custom docker image, please specify it here (default: leave blank).') string( name: 'ROCMVERSION', - defaultValue: '6.2', - description: 'Specify which ROCM version to use: 6.2 (default).') + defaultValue: '6.3', + description: 'Specify which ROCM version to use: 6.3 (default).') string( name: 'COMPILER_VERSION', defaultValue: '', diff --git a/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp b/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp index 978221f8e1..bf560f8a43 100644 --- a/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp +++ b/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp @@ -172,12 +172,13 @@ bool run_grouped_conv_fwd(bool do_verification, { case 0: break; case 1: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + // values generated: -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5 + in.GenerateTensorValue(GeneratorTensor_2{-5, 6}); + wei.GenerateTensorValue(GeneratorTensor_3{-1.0, 1.0}); break; default: - 
in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-1.0, 1.0}); } DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); From 58e7f37fc892c1e7aeca338f96ec694712e6e412 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Fri, 6 Dec 2024 12:59:58 +0800 Subject: [PATCH 40/52] Undo padding-flag changes in fmha_fwd_kernel.hpp (#1725) --- .../ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 3a66b78a5f..3de433d6a7 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -998,14 +998,14 @@ struct FmhaFwdKernel return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { return pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }(); const auto k_dram = [&]() { @@ -1019,7 +1019,7 @@ struct FmhaFwdKernel return pad_tensor_view( k_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); const auto v_dram = [&]() { if constexpr(std::is_same_v) @@ -1041,7 +1041,7 @@ struct FmhaFwdKernel return pad_tensor_view( v_dram_transposed, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { @@ -1055,7 +1055,7 @@ struct FmhaFwdKernel return pad_tensor_view( v_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }(); @@ -1097,8 +1097,9 @@ struct FmhaFwdKernel number{}, number<1>{}); - return pad_tensor_view( - bias_dram_naive, bias_dram_window_lengths, sequence{}); + return pad_tensor_view(bias_dram_naive, + bias_dram_window_lengths, + sequence{}); }(); return make_tile_window(bias_dram, 
bias_dram_window_lengths, {i_m0, 0}); From 261f1759de15fd319ba03985ebe7123fae12a722 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Fri, 6 Dec 2024 10:55:23 +0100 Subject: [PATCH 41/52] Support large batch tensors in grouped conv bwd data (#1711) * Support large batch tensors in grouped conv bwd data * Fix multiD * fixes * fixes * fixes --- ...conv_bwd_data_multiple_d_wmma_cshuffle.hpp | 184 +-- ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 290 ++-- .../transform_conv_bwd_data_to_gemm_v1.hpp | 1309 ++++++++++------- test/grouped_convnd_bwd_data/CMakeLists.txt | 8 +- .../test_grouped_convnd_bwd_data_wmma.cpp | 108 ++ ...p => test_grouped_convnd_bwd_data_xdl.cpp} | 39 +- 6 files changed, 1081 insertions(+), 857 deletions(-) create mode 100644 test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp rename test/grouped_convnd_bwd_data/{test_grouped_convnd_bwd_data_xdl_wmma.cpp => test_grouped_convnd_bwd_data_xdl.cpp} (78%) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp index 3fb047f207..359711e5c4 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -106,89 +106,35 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle static constexpr auto I3 = Number<3>{}; static constexpr index_t KPerBlock = K0PerBlock * K1; - static constexpr auto transform_conv_to_gemm = - TransformConvBwdDataToGemm_v1{}; + using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1; - static auto GetDummyABDsEGridDescriptor() + static auto + GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform& conv_to_gemm_transform) { - const std::array dummy_tensor_lengths = {1}; - const std::array dummy_tensor_strides = {1}; - const std::array dummy_spatial_lengths = {1}; - - const auto a_grid_desc_ak0_m_ak1 = - transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); - - const auto b_grid_desc_bk0_n_bk1 = - transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); - - const auto ds_grid_desc_m_n = generate_tuple( - [&](auto i) { - using DLayout = remove_cvref_t>; - - return transform_conv_to_gemm.template MakeCDescriptor_M_N( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); - }, - Number{}); - - const auto e_grid_desc_m_n = - transform_conv_to_gemm.template MakeCDescriptor_M_N(dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, 
- dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); - + const auto a_grid_desc_ak0_m_ak1 = conv_to_gemm_transform.MakeADescriptor_AK0_M_AK1(); + const auto b_grid_desc_bk0_n_bk1 = conv_to_gemm_transform.MakeBDescriptor_BK0_N_BK1(); + const auto ds_grid_desc_m_n = + generate_tuple([&](auto) { return conv_to_gemm_transform.MakeCDescriptor_M_N(); }, + Number{}); + const auto e_grid_desc_m_n = conv_to_gemm_transform.MakeCDescriptor_M_N(); return make_tuple( a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n); } // desc - using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor()); + constexpr static ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform; + using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform)); using AGridDesc_AK0_M_AK1 = remove_cvref_t>; using BGridDesc_BK0_N_BK1 = remove_cvref_t>; @@ -270,7 +216,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle const std::array& b_g_k_c_xs_lengths, const std::array& b_g_k_c_xs_strides, const std::array, NumDTensor>& - ds_g_n_c_wis_lengths, + /*ds_g_n_c_wis_lengths*/, const std::array, NumDTensor>& ds_g_n_c_wis_strides, const std::array& e_g_n_c_wis_lengths, @@ -291,15 +237,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle b_element_op_{b_element_op}, cde_element_op_{cde_element_op}, a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths}, - a_g_n_k_wos_strides_{a_g_n_k_wos_strides}, b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, - b_g_k_c_xs_strides_{b_g_k_c_xs_strides}, - ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths}, - ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides}, - e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths}, - e_g_n_c_wis_strides_{e_g_n_c_wis_strides}, conv_filter_strides_{conv_filter_strides}, - conv_filter_dilations_{conv_filter_dilations}, input_left_pads_{input_left_pads}, input_right_pads_{input_right_pads} { @@ -382,68 +321,47 
@@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle tildes = {i_ztilde, i_ytilde, i_xtilde}; } + ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_c_wis_lengths, + e_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + tildes}; + const auto a_grid_desc_ak0_m_ak1 = - transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); + conv_to_gemm_transform_.MakeADescriptor_AK0_M_AK1(); const auto b_grid_desc_bk0_n_bk1 = - transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); + conv_to_gemm_transform_.MakeBDescriptor_BK0_N_BK1(); DsGridDesc_M_N ds_grid_desc_m_n; // populate Ds desc static_for<0, NumDTensor, 1>{}([&](auto i) { using DLayout = remove_cvref_t>; - - ds_grid_desc_m_n(i) = - transform_conv_to_gemm.template MakeCDescriptor_M_N( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - ds_g_n_c_wis_lengths[i], - ds_g_n_c_wis_strides[i], - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); - }); - - const auto e_grid_desc_m_n = - transform_conv_to_gemm.template MakeCDescriptor_M_N( + static_assert(is_same_v); + ConvToGemmBwdDataTransform conv_to_gemm_transform_d{ a_g_n_k_wos_lengths, a_g_n_k_wos_strides, b_g_k_c_xs_lengths, b_g_k_c_xs_strides, e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, + ds_g_n_c_wis_strides[i], conv_filter_strides, conv_filter_dilations, 
input_left_pads, input_right_pads, - tildes); + tildes}; + + ds_grid_desc_m_n(i) = conv_to_gemm_transform_d.MakeCDescriptor_M_N(); + }); + + const auto e_grid_desc_m_n = conv_to_gemm_transform_.MakeCDescriptor_M_N(); // for check validity ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n); @@ -522,17 +440,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle BElementwiseOp b_element_op_; CDEElementwiseOp cde_element_op_; - // for checking IsSupportedArgument() std::array a_g_n_k_wos_lengths_; - std::array a_g_n_k_wos_strides_; std::array b_g_k_c_xs_lengths_; - std::array b_g_k_c_xs_strides_; - std::array, NumDTensor> ds_g_n_c_wis_lengths_; - std::array, NumDTensor> ds_g_n_c_wis_strides_; - std::array e_g_n_c_wis_lengths_; - std::array e_g_n_c_wis_strides_; std::array conv_filter_strides_; - std::array conv_filter_dilations_; std::array input_left_pads_; std::array input_right_pads_; }; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index b544c925e1..c8c58d5d85 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -54,15 +54,16 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS @@ -73,10 +74,9 @@ __global__ void const ABDataType* __restrict__ p_b_grid, DsPointer p_ds_grid, EDataType* __restrict__ p_e_grid, - const AElementwiseOperation a_element_op, - const BElementwiseOperation b_element_op, - const CDEElementwiseOperation cde_element_op, - const index_t batch_count, + const AElementwiseOp a_element_op, + const BElementwiseOp b_element_op, + const CDEElementwiseOp cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const 
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock @@ -84,24 +84,29 @@ __global__ void const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_, const Block2ETileMap block_2_ctile_map, - const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, + const ComputePtrOffsetOfN compute_ptr_offset_of_n) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ defined(__gfx94__)) // offset base pointer for each work-group - const index_t num_blocks_per_batch = - __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); - const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); + const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y); - const long_index_t a_batch_offset = amd_wave_read_first_lane( - static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); - const long_index_t b_batch_offset = amd_wave_read_first_lane( - static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); - const long_index_t e_batch_offset = amd_wave_read_first_lane( - static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); + const long_index_t e_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + const long_index_t a_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); + const long_index_t e_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx)); + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; 
DsPointer p_ds_grid_grp; @@ -112,10 +117,10 @@ __global__ void static_for<0, NumDTensor, 1>{}( [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); - GridwiseGemm::template Run(p_a_grid + a_batch_offset, + GridwiseGemm::template Run(p_a_grid + a_batch_offset + a_n_offset, p_b_grid + b_batch_offset, p_ds_grid_grp, - p_e_grid + e_batch_offset, + p_e_grid + e_batch_offset + e_n_offset, p_shared, a_element_op, b_element_op, @@ -130,7 +135,6 @@ __global__ void ignore = p_b_grid; ignore = p_ds_grid; ignore = p_e_grid; - ignore = batch_count; ignore = a_grid_desc_ak0_m_ak1; ignore = b_grid_desc_bk0_n_bk1; ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; @@ -139,6 +143,7 @@ __global__ void ignore = b_element_op; ignore = cde_element_op; ignore = compute_ptr_offset_of_batch; + ignore = compute_ptr_offset_of_n; ignore = block_2_ctile_map; #endif } @@ -233,82 +238,54 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 static constexpr auto I2 = Number<2>{}; static constexpr auto I3 = Number<3>{}; - static constexpr auto transform_conv_to_gemm = - TransformConvBwdDataToGemm_v1{}; + using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1; - static auto GetDummyABDsEGridDescriptor() + static auto + GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform& conv_to_gemm_transform) { - const std::array dummy_tensor_lengths = {1}; - const std::array dummy_tensor_strides = {1}; - const std::array dummy_spatial_lengths = {1}; + const auto a_grid_desc_ak0_m_ak1 = conv_to_gemm_transform.MakeADescriptor_AK0_M_AK1(); - const auto a_grid_desc_ak0_m_ak1 = - transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); - - const auto b_grid_desc_bk0_n_bk1 = - 
transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); + const auto b_grid_desc_bk0_n_bk1 = conv_to_gemm_transform.MakeBDescriptor_BK0_N_BK1(); const auto ds_grid_desc_m_n = generate_tuple( [&](auto i) { - using DLayout = remove_cvref_t>; - - return transform_conv_to_gemm.template MakeCDescriptor_M_N( - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); + using DLayout = remove_cvref_t>; + using DDataType = remove_cvref_t>; + using ConvToGemmBwdDataTransformD = + TransformConvBwdDataToGemm_v1; + return ConvToGemmBwdDataTransformD{}.MakeCDescriptor_M_N(); }, Number{}); - const auto e_grid_desc_m_n = - transform_conv_to_gemm.template MakeCDescriptor_M_N(dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_tensor_lengths, - dummy_tensor_strides, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths, - dummy_spatial_lengths); + const auto e_grid_desc_m_n = conv_to_gemm_transform.MakeCDescriptor_M_N(); return make_tuple( a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n); @@ -377,7 +354,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 } // desc - using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor()); + constexpr static ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform; + using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform)); using AGridDesc_AK0_M_AK1 = remove_cvref_t>; using BGridDesc_BK0_N_BK1 = 
remove_cvref_t>; @@ -431,15 +409,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 b_element_op_{b_element_op}, cde_element_op_{cde_element_op}, a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths}, - a_g_n_k_wos_strides_{a_g_n_k_wos_strides}, b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, - b_g_k_c_xs_strides_{b_g_k_c_xs_strides}, - ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths}, - ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides}, - e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths}, - e_g_n_c_wis_strides_{e_g_n_c_wis_strides}, conv_filter_strides_{conv_filter_strides}, - conv_filter_dilations_{conv_filter_dilations}, input_left_pads_{input_left_pads}, input_right_pads_{input_right_pads} { @@ -450,11 +421,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 p_ds_grid_(i) = static_cast(p_ds[i]); }); - // A/B/Ds/E Batch Stride - compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0]; - compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; - compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0]; - static_for<0, NumDTensor, 1>{}([&](auto i) { compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_c_wis_strides[i][0]; }); @@ -526,68 +492,65 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 throw std::runtime_error("wrong! 
only implemented for 2D and 3D now"); } + ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_c_wis_lengths, + e_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + tildes}; + + conv_N_per_block_ = conv_to_gemm_transform_.N_; + const auto a_grid_desc_ak0_m_ak1 = - transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); + conv_to_gemm_transform_.MakeADescriptor_AK0_M_AK1(); const auto b_grid_desc_bk0_n_bk1 = - transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); + conv_to_gemm_transform_.MakeBDescriptor_BK0_N_BK1(); DsGridDesc_M_N ds_grid_desc_m_n; // populate Ds desc static_for<0, NumDTensor, 1>{}([&](auto i) { - using DLayout = remove_cvref_t>; - - ds_grid_desc_m_n(i) = - transform_conv_to_gemm.template MakeCDescriptor_M_N( - a_g_n_k_wos_lengths, - a_g_n_k_wos_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - ds_g_n_c_wis_lengths[i], - ds_g_n_c_wis_strides[i], - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - tildes); - }); - - const auto e_grid_desc_m_n = - transform_conv_to_gemm.template MakeCDescriptor_M_N( + using DLayout = remove_cvref_t>; + using DDataType = remove_cvref_t>; + using ConvToGemmBwdDataTransformD = + TransformConvBwdDataToGemm_v1; + ConvToGemmBwdDataTransformD conv_to_gemm_transform_d{ a_g_n_k_wos_lengths, a_g_n_k_wos_strides, b_g_k_c_xs_lengths, b_g_k_c_xs_strides, - 
e_g_n_c_wis_lengths, - e_g_n_c_wis_strides, + ds_g_n_c_wis_lengths[i], + ds_g_n_c_wis_strides[i], conv_filter_strides, conv_filter_dilations, input_left_pads, input_right_pads, - tildes); + tildes}; + + ds_grid_desc_m_n(i) = conv_to_gemm_transform_d.MakeCDescriptor_M_N(); + }); + + const auto e_grid_desc_m_n = conv_to_gemm_transform_.MakeCDescriptor_M_N(); // desc for problem definition const auto a_grid_desc_m_k = @@ -628,6 +591,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 } } } + // A/B/Ds/E Batch Stride + compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0]; + + compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_k_wos_strides[1] * conv_N_per_block_; + compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_c_wis_strides[1] * conv_N_per_block_; } void Print() const @@ -660,6 +630,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // tensor descriptor for problem definition index_t num_group_; + index_t conv_N_per_block_; std::vector a_grid_desc_m_k_container_; std::vector b_grid_desc_n_k_container_; std::vector ds_grid_desc_m_n_container_; @@ -678,23 +649,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // for computing batch offset ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_n_; // element-wise op AElementwiseOp a_element_op_; BElementwiseOp b_element_op_; CDEElementwiseOp cde_element_op_; - // for checking IsSupportedArgument() std::array a_g_n_k_wos_lengths_; - std::array a_g_n_k_wos_strides_; std::array b_g_k_c_xs_lengths_; - std::array b_g_k_c_xs_strides_; - std::array, NumDTensor> ds_g_n_c_wis_lengths_; - std::array, NumDTensor> ds_g_n_c_wis_strides_; - std::array e_g_n_c_wis_lengths_; - std::array e_g_n_c_wis_strides_; std::array conv_filter_strides_; - std::array conv_filter_dilations_; std::array 
input_left_pads_; std::array input_right_pads_; }; @@ -711,8 +675,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 arg.Print(); } - float ave_time = 0; + const index_t gdy = arg.num_group_; + const index_t num_workgroups_per_Conv_N = + arg.a_g_n_k_wos_lengths_[I1] / arg.conv_N_per_block_; + const index_t gdz = num_workgroups_per_Conv_N; + float ave_time = 0; for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++) { if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i], @@ -724,9 +692,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 throw std::runtime_error("wrong! device_op has invalid setting"); } - const index_t grid_size = arg.block_2_etile_map_container_[i].CalculateGridSize( - arg.e_grid_desc_m_n_container_[i]) * - arg.num_group_; + const index_t gdx = arg.block_2_etile_map_container_[i].CalculateGridSize( + arg.e_grid_desc_m_n_container_[i]); const auto GemmK = arg.a_grid_desc_m_k_container_[i].GetLength(I1); @@ -747,12 +714,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, Block2ETileMap, ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, has_main_loop>; return launch_and_time_kernel( stream_config, kernel, - dim3(grid_size), + dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg.p_a_grid_, @@ -762,13 +730,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 arg.a_element_op_, arg.b_element_op_, arg.cde_element_op_, - arg.a_g_n_k_wos_lengths_[0], // Group count arg.a_grid_desc_ak0_m_ak1_container_[i], arg.b_grid_desc_bk0_n_bk1_container_[i], arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i], arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i], arg.block_2_etile_map_container_[i], - arg.compute_ptr_offset_of_batch_); + arg.compute_ptr_offset_of_batch_, + arg.compute_ptr_offset_of_n_); }; if(GridwiseGemm::CalculateHasMainKBlockLoop(GemmK)) diff --git 
a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp index 2be0b66812..8df0d885b9 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -13,150 +13,6 @@ namespace ck { namespace tensor_operation { -namespace { -template < - index_t NDimSpatial, - typename ALayout, - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization> -constexpr auto make_out_grid_desc(const index_t N, - const index_t Do, - const index_t Ho, - const index_t Wo, - const index_t K, - const std::array& out_g_n_k_wos_strides) -{ - const auto KStride = Number<1>{}; - - if constexpr(is_same_v) - { - const index_t NStride = out_g_n_k_wos_strides[1]; - const index_t HiStride = out_g_n_k_wos_strides[3]; - const index_t WiStride = out_g_n_k_wos_strides[4]; - if constexpr(ConvBwdDataSpecialization == - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: - Filter1x1Stride1Pad0) - { - - return make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, K), - make_tuple(WiStride, KStride)); - } - else - { - return make_naive_tensor_descriptor(make_tuple(N, Ho, Wo, K), - make_tuple(NStride, HiStride, WiStride, KStride)); - } - } - else if constexpr(is_same_v) - { - const index_t NStride = out_g_n_k_wos_strides[1]; - const index_t DoStride = out_g_n_k_wos_strides[3]; - const index_t HoStride = out_g_n_k_wos_strides[4]; - const index_t WoStride = out_g_n_k_wos_strides[5]; - if constexpr(ConvBwdDataSpecialization == - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: - 
Filter1x1Stride1Pad0) - { - - return make_naive_tensor_descriptor(make_tuple(N * Do * Ho * Wo, K), - make_tuple(WoStride, KStride)); - } - else - { - return make_naive_tensor_descriptor( - make_tuple(N, Do, Ho, Wo, K), - make_tuple(NStride, DoStride, HoStride, WoStride, KStride)); - } - } - else if constexpr(is_same_v) - { - // assume packed - if constexpr(ConvBwdDataSpecialization == - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: - Filter1x1Stride1Pad0) - { - return make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); - } - else - { - return make_naive_tensor_descriptor_packed(make_tuple(N, Ho, Wo, K)); - } - } - else if constexpr(is_same_v) - { - // assume packed - if constexpr(ConvBwdDataSpecialization == - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: - Filter1x1Stride1Pad0) - { - return make_naive_tensor_descriptor_packed(make_tuple(N * Do * Ho * Wo, K)); - } - else - { - return make_naive_tensor_descriptor_packed(make_tuple(N, Do, Ho, Wo, K)); - } - } - else - { - throw std::runtime_error("wrong! unsupported layout: " + ALayout::name()); - } -} - -template -constexpr auto make_wei_grid_desc( - const index_t K, const index_t Z, const index_t Y, const index_t X, const index_t C) -{ - - if constexpr(is_same_v) - { - return make_naive_tensor_descriptor_packed(make_tuple(K, Y, X, C)); - } - else if constexpr(is_same_v) - { - return make_naive_tensor_descriptor_packed(make_tuple(K, Z, Y, X, C)); - } - else - { - throw std::runtime_error("wrong! 
unsupported layout: " + BLayout::name()); - } -} - -template -constexpr auto make_in_grid_desc(const index_t N, - const index_t Di, - const index_t Hi, - const index_t Wi, - const index_t C, - const std::array& in_g_n_c_wis_strides) -{ - - if constexpr(is_same_v || - is_same_v || - is_same_v) - { - return make_naive_tensor_descriptor(make_tuple(N, Hi, Wi, C), - make_tuple(in_g_n_c_wis_strides[1], - in_g_n_c_wis_strides[3], - in_g_n_c_wis_strides[4], - in_g_n_c_wis_strides[2])); - } - else if constexpr(is_same_v || - is_same_v) - { - return make_naive_tensor_descriptor(make_tuple(N, Di, Hi, Wi, C), - make_tuple(in_g_n_c_wis_strides[1], - in_g_n_c_wis_strides[3], - in_g_n_c_wis_strides[4], - in_g_n_c_wis_strides[5], - in_g_n_c_wis_strides[2])); - } - else - { - throw std::runtime_error("wrong! unsupported layout: " + CLayout::name()); - } -} - -} // namespace - template < index_t NDimSpatial, ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization, @@ -166,92 +22,605 @@ template < index_t GemmNPerBlock, index_t GemmKPerBlock, bool DoPadGemmM, - bool DoPadGemmN> + bool DoPadGemmN, + typename ALayout, + typename BLayout, + typename CLayout, + bool SplitN = false, + typename ADataType = float, + typename CDataType = float, + index_t NumGroupsToMerge = 1, + typename IndexType = index_t> struct TransformConvBwdDataToGemm_v1 { + private: static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; static constexpr auto NonSpatialDimsNum = Number<3>{}; - static constexpr auto DIdx = Number{}; + static constexpr auto DIdx = NonSpatialDimsNum; static constexpr auto HIdx = - NDimSpatial == 2 ? Number{} : Number{}; + NDimSpatial == 2 ? NonSpatialDimsNum : Number{}; static constexpr auto WIdx = NDimSpatial == 2 ? 
Number{} : Number{}; - static constexpr auto ZIdx = Number{}; + static constexpr auto ZIdx = NonSpatialDimsNum; static constexpr auto YIdx = - NDimSpatial == 2 ? Number{} : Number{}; + NDimSpatial == 2 ? NonSpatialDimsNum : Number{}; static constexpr auto XIdx = NDimSpatial == 2 ? Number{} : Number{}; - template || - is_same_v || - is_same_v || - is_same_v), - bool>::type = false> - static auto MakeADescriptor_AK0_M_AK1( - const std::array& out_g_n_k_wos_lengths, - const std::array& out_g_n_k_wos_strides, - const std::array& wei_g_k_c_xs_lengths, - const std::array& /* wei_g_k_c_xs_strides */, - const std::array& in_g_n_c_wis_lengths, - const std::array& /* in_g_n_c_wis_strides */, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& input_left_pads, - const std::array& /* input_right_pads */, - const std::array& tildes) + template + static long_index_t calculate_element_space_size_impl(const ConvDimsType& lengths, + const ConvDimsType& strides, + index_t i) { - index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum]; - index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum]; - index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum]; + long_index_t acc = 1; + for(; i < (NDimSpatial + 3); i++) + { + acc += + static_cast(lengths[i] - I1) * static_cast(strides[i]); + } - const index_t N = in_g_n_c_wis_lengths[1]; - const index_t K = wei_g_k_c_xs_lengths[1]; + return acc; + } - const index_t Di = NDimSpatial == 3 ? 
in_g_n_c_wis_lengths[DIdx] : 1; - const index_t Hi = in_g_n_c_wis_lengths[HIdx]; - const index_t Wi = in_g_n_c_wis_lengths[WIdx]; + template + static IndexType GetSplitedNSize(const ConvDimsType& a_g_n_k_wos_lengths, + const ConvDimsType& a_g_n_k_wos_strides, + const ConvDimsType& c_g_n_c_wis_lengths, + const ConvDimsType& c_g_n_c_wis_strides) + { + const long_index_t a_element_space_size = + calculate_element_space_size_impl(a_g_n_k_wos_lengths, a_g_n_k_wos_strides, I1); + const long_index_t c_element_space_size = + calculate_element_space_size_impl(c_g_n_c_wis_lengths, c_g_n_c_wis_strides, I1); + const long_index_t element_space_size = math::max(a_element_space_size * sizeof(ADataType), + c_element_space_size * sizeof(CDataType)); + constexpr long_index_t TwoGB = (long_index_t{1} << 31); - const index_t Do = NDimSpatial == 3 ? out_g_n_k_wos_lengths[DIdx] : 1; - const index_t Ho = out_g_n_k_wos_lengths[HIdx]; - const index_t Wo = out_g_n_k_wos_lengths[WIdx]; + const IndexType N = a_g_n_k_wos_lengths[I1]; - const index_t Z = NDimSpatial == 3 ? wei_g_k_c_xs_lengths[ZIdx] : 1; - const index_t Y = wei_g_k_c_xs_lengths[YIdx]; - const index_t X = wei_g_k_c_xs_lengths[XIdx]; + if(element_space_size > TwoGB) + { + // Minimum divisor of N to not exceed 2GB + const auto divisor = math::integer_divide_ceil(element_space_size, TwoGB); - const index_t InLeftPadD = input_left_pads[DIdx - NonSpatialDimsNum]; - const index_t InLeftPadH = input_left_pads[HIdx - NonSpatialDimsNum]; - const index_t InLeftPadW = input_left_pads[WIdx - NonSpatialDimsNum]; + if(divisor <= static_cast(N)) + { + // Find least divisor of N larger than element_space_size / TwoGB + // Only candidates up to sqrt(N) are tried; larger divisors of N are not searched.
+ for(IndexType least_divisor = divisor; least_divisor * least_divisor <= N; + least_divisor++) + { + if(N % least_divisor == 0) + { + return N / least_divisor; + } + } + // Not found, process one Convolution N per block + return 1; + } + else + { + // Not possible to support even after split N. + // Too large tensor. + return N; + } + } + else + { + // Split N is not needed. + return N; + } + } - const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum]; - const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum]; - const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum]; + public: + __host__ __device__ constexpr TransformConvBwdDataToGemm_v1() {} - const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum]; - const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum]; - const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum]; + template + __host__ __device__ TransformConvBwdDataToGemm_v1( + const TransformConvBwdDataToGemm_v1Base& transform_conv_bwd_data_to_gemm_base) + : N_{static_cast(transform_conv_bwd_data_to_gemm_base.N_)}, + Di_{static_cast(transform_conv_bwd_data_to_gemm_base.Di_)}, + Hi_{static_cast(transform_conv_bwd_data_to_gemm_base.Hi_)}, + Wi_{static_cast(transform_conv_bwd_data_to_gemm_base.Wi_)}, + Do_{static_cast(transform_conv_bwd_data_to_gemm_base.Do_)}, + Ho_{static_cast(transform_conv_bwd_data_to_gemm_base.Ho_)}, + Wo_{static_cast(transform_conv_bwd_data_to_gemm_base.Wo_)}, + Z_{static_cast(transform_conv_bwd_data_to_gemm_base.Z_)}, + Y_{static_cast(transform_conv_bwd_data_to_gemm_base.Y_)}, + X_{static_cast(transform_conv_bwd_data_to_gemm_base.X_)}, + K_{static_cast(transform_conv_bwd_data_to_gemm_base.K_)}, + C_{static_cast(transform_conv_bwd_data_to_gemm_base.C_)}, + DiStride_{static_cast(transform_conv_bwd_data_to_gemm_base.DiStride_)}, + HiStride_{static_cast(transform_conv_bwd_data_to_gemm_base.HiStride_)}, + 
WiStride_{static_cast(transform_conv_bwd_data_to_gemm_base.WiStride_)}, + DoStride_{static_cast(transform_conv_bwd_data_to_gemm_base.DoStride_)}, + HoStride_{static_cast(transform_conv_bwd_data_to_gemm_base.HoStride_)}, + WoStride_{static_cast(transform_conv_bwd_data_to_gemm_base.WoStride_)}, + CStrideTensorB_{ + static_cast(transform_conv_bwd_data_to_gemm_base.CStrideTensorB_)}, + CStrideTensorC_{ + static_cast(transform_conv_bwd_data_to_gemm_base.CStrideTensorC_)}, + KStrideTensorA_{ + static_cast(transform_conv_bwd_data_to_gemm_base.KStrideTensorA_)}, + KStrideTensorB_{ + static_cast(transform_conv_bwd_data_to_gemm_base.KStrideTensorB_)}, + NStrideTensorA_{ + static_cast(transform_conv_bwd_data_to_gemm_base.NStrideTensorA_)}, + NStrideTensorC_{ + static_cast(transform_conv_bwd_data_to_gemm_base.NStrideTensorC_)}, + ConvStrideD_{static_cast(transform_conv_bwd_data_to_gemm_base.ConvStrideD_)}, + ConvStrideH_{static_cast(transform_conv_bwd_data_to_gemm_base.ConvStrideH_)}, + ConvStrideW_{static_cast(transform_conv_bwd_data_to_gemm_base.ConvStrideW_)}, + ConvDilationD_{ + static_cast(transform_conv_bwd_data_to_gemm_base.ConvDilationD_)}, + ConvDilationH_{ + static_cast(transform_conv_bwd_data_to_gemm_base.ConvDilationH_)}, + ConvDilationW_{ + static_cast(transform_conv_bwd_data_to_gemm_base.ConvDilationW_)}, + InLeftPadD_{static_cast(transform_conv_bwd_data_to_gemm_base.InLeftPadD_)}, + InLeftPadH_{static_cast(transform_conv_bwd_data_to_gemm_base.InLeftPadH_)}, + InLeftPadW_{static_cast(transform_conv_bwd_data_to_gemm_base.InLeftPadW_)}, + InRightPadD_{static_cast(transform_conv_bwd_data_to_gemm_base.InRightPadD_)}, + InRightPadH_{static_cast(transform_conv_bwd_data_to_gemm_base.InRightPadH_)}, + InRightPadW_{static_cast(transform_conv_bwd_data_to_gemm_base.InRightPadW_)}, + IdxZTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.IdxZTilde_)}, + IdxYTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.IdxYTilde_)}, + 
IdxXTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.IdxXTilde_)}, + GcdStrideDilationD_{ + static_cast(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationD_)}, + GcdStrideDilationH_{ + static_cast(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationH_)}, + GcdStrideDilationW_{ + static_cast(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationW_)}, + ZTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.ZTilde_)}, + YTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.YTilde_)}, + XTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.XTilde_)}, + DTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.DTilde_)}, + HTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.HTilde_)}, + WTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.WTilde_)}, + ZDot_{static_cast(transform_conv_bwd_data_to_gemm_base.ZDot_)}, + YDot_{static_cast(transform_conv_bwd_data_to_gemm_base.YDot_)}, + XDot_{static_cast(transform_conv_bwd_data_to_gemm_base.XDot_)} + { + } + template + __host__ __device__ + TransformConvBwdDataToGemm_v1(const ConvDimsType& a_g_n_k_wos_lengths, + const ConvDimsType& a_g_n_k_wos_strides, + const ConvDimsType& b_g_k_c_xs_lengths, + const ConvDimsType& b_g_k_c_xs_strides, + const ConvDimsType& c_g_n_c_wis_lengths, + const ConvDimsType& c_g_n_c_wis_strides, + const ConvSpatialDimsType& conv_filter_strides, + const ConvSpatialDimsType& conv_filter_dilations, + const ConvSpatialDimsType& input_left_pads, + const ConvSpatialDimsType& input_right_pads, + const ConvSpatialDimsType& tildes) + : Hi_{c_g_n_c_wis_lengths[HIdx]}, + Wi_{c_g_n_c_wis_lengths[WIdx]}, + Ho_{a_g_n_k_wos_lengths[HIdx]}, + Wo_{a_g_n_k_wos_lengths[WIdx]}, + Y_{b_g_k_c_xs_lengths[YIdx]}, + X_{b_g_k_c_xs_lengths[XIdx]}, + K_{a_g_n_k_wos_lengths[I2]}, + C_{b_g_k_c_xs_lengths[I2]}, + HiStride_{c_g_n_c_wis_strides[HIdx]}, + WiStride_{c_g_n_c_wis_strides[WIdx]}, + HoStride_{a_g_n_k_wos_strides[HIdx]}, + WoStride_{a_g_n_k_wos_strides[WIdx]}, + 
CStrideTensorB_{b_g_k_c_xs_strides[I2]}, + CStrideTensorC_{c_g_n_c_wis_strides[I2]}, + KStrideTensorA_{a_g_n_k_wos_strides[I2]}, + KStrideTensorB_{b_g_k_c_xs_strides[I1]}, + NStrideTensorA_{a_g_n_k_wos_strides[I1]}, + NStrideTensorC_{c_g_n_c_wis_strides[I1]}, + ConvStrideH_{conv_filter_strides[HIdx - NonSpatialDimsNum]}, + ConvStrideW_{conv_filter_strides[WIdx - NonSpatialDimsNum]}, + ConvDilationH_{conv_filter_dilations[HIdx - NonSpatialDimsNum]}, + ConvDilationW_{conv_filter_dilations[WIdx - NonSpatialDimsNum]}, + InLeftPadH_{input_left_pads[HIdx - NonSpatialDimsNum]}, + InLeftPadW_{input_left_pads[WIdx - NonSpatialDimsNum]}, + InRightPadH_{input_right_pads[HIdx - NonSpatialDimsNum]}, + InRightPadW_{input_right_pads[WIdx - NonSpatialDimsNum]}, + IdxYTilde_{tildes[YIdx - NonSpatialDimsNum]}, + IdxXTilde_{tildes[XIdx - NonSpatialDimsNum]} + { + static_assert(is_same_v> || + is_same_v>); + static_assert(is_same_v> || + is_same_v>); + + if constexpr(SplitN) + { + N_ = GetSplitedNSize( + a_g_n_k_wos_lengths, a_g_n_k_wos_strides, c_g_n_c_wis_lengths, c_g_n_c_wis_strides); + } + else + { + N_ = c_g_n_c_wis_lengths[I1]; + } + if constexpr(NDimSpatial == 3) + { + Di_ = c_g_n_c_wis_lengths[DIdx]; + Do_ = a_g_n_k_wos_lengths[DIdx]; + Z_ = b_g_k_c_xs_lengths[ZIdx]; + DiStride_ = c_g_n_c_wis_strides[DIdx]; + DoStride_ = a_g_n_k_wos_strides[DIdx]; + ConvStrideD_ = conv_filter_strides[DIdx - NonSpatialDimsNum]; + ConvDilationD_ = conv_filter_dilations[DIdx - NonSpatialDimsNum]; + InLeftPadD_ = input_left_pads[DIdx - NonSpatialDimsNum]; + InRightPadD_ = input_right_pads[DIdx - NonSpatialDimsNum]; + IdxZTilde_ = tildes[ZIdx - NonSpatialDimsNum]; + GcdStrideDilationD_ = math::gcd(ConvStrideD_, ConvDilationD_); + ZTilde_ = ConvStrideD_ / GcdStrideDilationD_; + DTilde_ = Do_ + math::integer_divide_ceil(ConvDilationD_ * (Z_ - I1), ConvStrideD_); + ZDot_ = math::integer_divide_ceil(Z_, ZTilde_); + } + else + { + Di_ = Do_ = Z_ = ZTilde_ = ConvStrideD_ = DTilde_ = ZDot_ = 1; + 
InLeftPadD_ = InRightPadD_ = DiStride_ = DoStride_ = IdxZTilde_ = 0; + } + + GcdStrideDilationH_ = math::gcd(ConvStrideH_, ConvDilationH_); + GcdStrideDilationW_ = math::gcd(ConvStrideW_, ConvDilationW_); + + YTilde_ = ConvStrideH_ / GcdStrideDilationH_; + XTilde_ = ConvStrideW_ / GcdStrideDilationW_; + + HTilde_ = Ho_ + math::integer_divide_ceil(ConvDilationH_ * (Y_ - I1), ConvStrideH_); + WTilde_ = Wo_ + math::integer_divide_ceil(ConvDilationW_ * (X_ - I1), ConvStrideW_); + + YDot_ = math::integer_divide_ceil(Y_, YTilde_); + XDot_ = math::integer_divide_ceil(X_, XTilde_); + } + +#if 0 // At now not supported to split tensor + __host__ bool AreDescriptorsSmallerThan2GB() const + { + constexpr long_index_t TwoGB = (long_index_t{1} << 31); + + const long_index_t in_desc_space_size = + I1 + (N_ - I1) * NStrideTensorC_ + (Di_ - I1) * DiStride_ + (Hi_ - I1) * HiStride_ + + (Wi_ - I1) * WiStride_ + (C_ - I1) * CStrideTensorC_; + const long_index_t out_desc_space_size = + I1 + (N_ - I1) * NStrideTensorA_ + (Do_ - I1) * DoStride_ + (Ho_ - I1) * HoStride_ + + (Wo_ - I1) * WoStride_ + (K_ - I1) * KStrideTensorA_; + + bool is_a_descriptor_smaller_than_2GB = (out_desc_space_size * sizeof(ADataType)) <= TwoGB; + bool is_c_descriptor_smaller_than_2GB = (in_desc_space_size * sizeof(CDataType)) <= TwoGB; + + return is_a_descriptor_smaller_than_2GB && is_c_descriptor_smaller_than_2GB; + } + + __host__ auto SplitConvProblem(const ADataType* a_grid_ptr_base, + CDataType* c_grid_ptr_base) const + { + // Create copies + auto conv_to_gemm_transformer_left = *this; + auto conv_to_gemm_transformer_right = *this; + IndexType a_right_offset = 0; + IndexType c_right_offset = 0; + // Calculate real filter size + const IndexType z_eff = (Z_ - 1) * ConvDilationD_ + 1; + const IndexType y_eff = (Y_ - 1) * ConvDilationH_ + 1; + const IndexType x_eff = (X_ - 1) * ConvDilationW_ + 1; + // Calculate start position in input for right tensor + const IndexType di_right_transformer_start_idx = (Do_ / 
2) * ConvStrideD_; + const IndexType hi_right_transformer_start_idx = (Ho_ / 2) * ConvStrideH_; + const IndexType wi_right_transformer_start_idx = (Wo_ / 2) * ConvStrideW_; + // Calculate last position in input for left tensor + const IndexType di_left_transformer_end_idx = (Do_ / 2 - 1) * ConvStrideD_ + z_eff; + const IndexType hi_left_transformer_end_idx = (Ho_ / 2 - 1) * ConvStrideH_ + y_eff; + const IndexType wi_left_transformer_end_idx = (Wo_ / 2 - 1) * ConvStrideW_ + x_eff; + // Allow to split if whole left padding will be in left tensor and right padding in right + // tensor + const bool is_possible_to_split_d = Do_ != 1 && + di_right_transformer_start_idx > InLeftPadD_ && + di_left_transformer_end_idx <= (InLeftPadD_ + Di_); + const bool is_possible_to_split_h = Ho_ != 1 && + hi_right_transformer_start_idx > InLeftPadH_ && + hi_left_transformer_end_idx <= (InLeftPadH_ + Hi_); + const bool is_possible_to_split_w = Wo_ != 1 && + wi_right_transformer_start_idx > InLeftPadW_ && + wi_left_transformer_end_idx <= (InLeftPadW_ + Wi_); + + if(is_possible_to_split_d) + { + // Apply new sizes + // Split output on half + conv_to_gemm_transformer_left.Do_ = Do_ / 2; + conv_to_gemm_transformer_right.Do_ = Do_ - Do_ / 2; + // Assign left padding to left convolution + conv_to_gemm_transformer_left.InLeftPadD_ = InLeftPadD_; + conv_to_gemm_transformer_right.InLeftPadD_ = 0; + // Assign right padding to right convolution + conv_to_gemm_transformer_left.InRightPadD_ = 0; + conv_to_gemm_transformer_right.InRightPadD_ = InRightPadD_; + // Calculate new input size + conv_to_gemm_transformer_left.Di_ = di_left_transformer_end_idx - InLeftPadD_; + conv_to_gemm_transformer_right.Di_ = + math::min(Di_ - (di_right_transformer_start_idx - InLeftPadD_), + (conv_to_gemm_transformer_right.Do_ - 1) * ConvStrideD_ + z_eff); + ; + // Calcualte offsets + a_right_offset = (Do_ / 2) * DoStride_; + c_right_offset = ((Do_ / 2) * ConvStrideD_ - InLeftPadD_) * DiStride_; + } + else 
if(is_possible_to_split_h) + { + conv_to_gemm_transformer_left.Ho_ = Ho_ / 2; + conv_to_gemm_transformer_right.Ho_ = Ho_ - Ho_ / 2; + + conv_to_gemm_transformer_left.InLeftPadH_ = InLeftPadH_; + conv_to_gemm_transformer_right.InLeftPadH_ = 0; + + conv_to_gemm_transformer_left.InRightPadH_ = 0; + conv_to_gemm_transformer_right.InRightPadH_ = InRightPadH_; + + conv_to_gemm_transformer_left.Hi_ = hi_left_transformer_end_idx - InLeftPadH_; + conv_to_gemm_transformer_right.Hi_ = + math::min(Hi_ - (hi_right_transformer_start_idx - InLeftPadH_), + (conv_to_gemm_transformer_right.Ho_ - 1) * ConvStrideH_ + y_eff); + a_right_offset = (Ho_ / 2) * HoStride_; + c_right_offset = ((Ho_ / 2) * ConvStrideH_ - InLeftPadH_) * HiStride_; + } + else if(is_possible_to_split_w) + { + conv_to_gemm_transformer_left.Wo_ = Wo_ / 2; + conv_to_gemm_transformer_right.Wo_ = Wo_ - Wo_ / 2; + + conv_to_gemm_transformer_left.InLeftPadW_ = InLeftPadW_; + conv_to_gemm_transformer_right.InLeftPadW_ = 0; + + conv_to_gemm_transformer_left.InRightPadW_ = 0; + conv_to_gemm_transformer_right.InRightPadW_ = InRightPadW_; + + conv_to_gemm_transformer_left.Wi_ = wi_left_transformer_end_idx - InLeftPadW_; + conv_to_gemm_transformer_right.Wi_ = + math::min(Wi_ - (wi_right_transformer_start_idx - InLeftPadW_), + (conv_to_gemm_transformer_right.Wo_ - 1) * ConvStrideW_ + x_eff); + + a_right_offset = (Wo_ / 2) * WoStride_; + c_right_offset = ((Wo_ / 2) * ConvStrideW_ - InLeftPadW_) * WiStride_; + } + // Return left transform, right transformer, right offset to Input and right offset to + // Output + return ck::make_tuple(conv_to_gemm_transformer_left, + conv_to_gemm_transformer_right, + a_grid_ptr_base + a_right_offset, + c_grid_ptr_base + c_right_offset); + } + + __host__ auto SplitConvProblem(const ADataType* a_grid_ptr_base, + CDataType* c_grid_ptr_base) const + { + // Create copies + auto conv_to_gemm_transformer_left = *this; + auto conv_to_gemm_transformer_right = *this; + IndexType a_right_offset = 0; + 
IndexType c_right_offset = 0; + + // Calculate start position in input for right tensor + const IndexType do_right_transformer_start_idx = math::integer_divide_ceil((Di_ / 2) + InLeftPadD_ - ((Z_ - 1) * ConvDilationD_), ConvStrideD_); + const IndexType ho_right_transformer_start_idx = math::integer_divide_ceil((Hi_ / 2) + InLeftPadH_ - ((Y_ - 1) * ConvDilationH_), ConvStrideH_); + const IndexType wo_right_transformer_start_idx = math::integer_divide_ceil((Wi_ / 2) + InLeftPadW_ - ((X_ - 1) * ConvDilationW_), ConvStrideW_); + // Calculate last position in input for left tensor + const IndexType do_left_transformer_end_idx = math::integer_divide_ceil((Di_ / 2 - 1) + InLeftPadD_, ConvStrideD_); + const IndexType ho_left_transformer_end_idx = math::integer_divide_ceil((Hi_ / 2 - 1) + InLeftPadH_, ConvStrideH_); + const IndexType wo_left_transformer_end_idx = math::integer_divide_ceil((Wi_ / 2 - 1) + InLeftPadW_, ConvStrideW_); + + + if(Di_!=1) + { + // Apply new sizes + // Split output on half + conv_to_gemm_transformer_left.Di_ = Di_ / 2; + conv_to_gemm_transformer_right.Di_ = Di_ - Di_ / 2; + // Assign left padding to left convolution + conv_to_gemm_transformer_left.InLeftPadD_ = InLeftPadD_; + conv_to_gemm_transformer_right.InLeftPadD_ = 0; + // // Assign right padding to right convolution + conv_to_gemm_transformer_left.InRightPadD_ = 0; + conv_to_gemm_transformer_right.InRightPadD_ = InRightPadD_; + // Calculate new input size + conv_to_gemm_transformer_left.Do_ = do_left_transformer_end_idx; + conv_to_gemm_transformer_right.Do_ = Do_ - do_right_transformer_start_idx; + ; + // Calcualte offsets + a_right_offset = do_right_transformer_start_idx * DoStride_; + c_right_offset = (Di_ / 2) * DiStride_; + } + else if(Hi_!=1) + { + // Apply new sizes + // Split output on half + conv_to_gemm_transformer_left.Hi_ = Hi_ / 2; + conv_to_gemm_transformer_right.Hi_ = Hi_ - Hi_ / 2; + // Assign left padding to left convolution + conv_to_gemm_transformer_left.InLeftPadH_ = 
InLeftPadH_; + conv_to_gemm_transformer_right.InLeftPadH_ = 0; + // // Assign right padding to right convolution + conv_to_gemm_transformer_left.InRightPadH_ = 0; + conv_to_gemm_transformer_right.InRightPadH_ = InRightPadH_; + // Calculate new input size + conv_to_gemm_transformer_left.Ho_ = ho_left_transformer_end_idx ; + conv_to_gemm_transformer_right.Ho_ = Ho_ - ho_right_transformer_start_idx ; + ; + // Calcualte offsets + a_right_offset = ho_right_transformer_start_idx * HoStride_; + c_right_offset = (Hi_ / 2) * HiStride_; + } + else if(Wi_!=1) + { + // Apply new sizes + // Split output on half + conv_to_gemm_transformer_left.Wi_ = Wi_ / 2; + conv_to_gemm_transformer_right.Wi_ = Wi_ - Wi_ / 2; + // Assign left padding to left convolution + conv_to_gemm_transformer_left.InLeftPadW_ = InLeftPadW_; + conv_to_gemm_transformer_right.InLeftPadW_ = 0; + // Assign right padding to right convolution + conv_to_gemm_transformer_left.InRightPadW_ = 0; + conv_to_gemm_transformer_right.InRightPadW_ = InRightPadW_; + // Calculate new input size + conv_to_gemm_transformer_left.Wo_ = wo_left_transformer_end_idx; + conv_to_gemm_transformer_right.Wo_ = Wo_ - wo_right_transformer_start_idx; + ; + // Calcualte offsets + a_right_offset = wo_right_transformer_start_idx * WoStride_; + c_right_offset = (Wi_ / 2) * WiStride_; + } + // Return left transform, right transformer, right offset to Input and right offset to + // Output + return ck::make_tuple(conv_to_gemm_transformer_left, + conv_to_gemm_transformer_right, + a_grid_ptr_base + a_right_offset, + c_grid_ptr_base + c_right_offset); + } +#endif + + __host__ __device__ auto MakeOutGridDesc() const + { + if constexpr(is_same_v) + { + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { + + return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_), + make_tuple(WoStride_, KStrideTensorA_)); + } + else + { + return 
make_naive_tensor_descriptor( + make_tuple(N_, Ho_, Wo_, K_), + make_tuple(NStrideTensorA_, HoStride_, WoStride_, KStrideTensorA_)); + } + } + else if constexpr(is_same_v) + { + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { + + return make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, K_), + make_tuple(WoStride_, KStrideTensorA_)); + } + else + { + return make_naive_tensor_descriptor( + make_tuple(N_, Do_, Ho_, Wo_, K_), + make_tuple(NStrideTensorA_, DoStride_, HoStride_, WoStride_, KStrideTensorA_)); + } + } + else if constexpr(is_same_v) + { + // assume packed + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { + return make_naive_tensor_descriptor_packed(make_tuple(N_ * Ho_ * Wo_, K_)); + } + else + { + return make_naive_tensor_descriptor_packed(make_tuple(N_, Ho_, Wo_, K_)); + } + } + else if constexpr(is_same_v) + { + // assume packed + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { + return make_naive_tensor_descriptor_packed(make_tuple(N_ * Do_ * Ho_ * Wo_, K_)); + } + else + { + return make_naive_tensor_descriptor_packed(make_tuple(N_, Do_, Ho_, Wo_, K_)); + } + } + else + { + throw std::runtime_error("wrong! unsupported layout: " + ALayout::name()); + } + } + + __host__ __device__ auto MakeWeiGridDesc() const + { + + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor_packed(make_tuple(K_, Y_, X_, C_)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor_packed(make_tuple(K_, Z_, Y_, X_, C_)); + } + else + { + throw std::runtime_error("wrong! 
unsupported layout: " + BLayout::name()); + } + } + + __host__ __device__ auto MakeInGridDesc() const + { + + if constexpr(is_same_v || + is_same_v || + is_same_v) + { + return make_naive_tensor_descriptor( + make_tuple(N_, Hi_, Wi_, C_), + make_tuple(NStrideTensorC_, HiStride_, WiStride_, CStrideTensorC_)); + } + else if constexpr(is_same_v || + is_same_v) + { + return make_naive_tensor_descriptor( + make_tuple(N_, Di_, Hi_, Wi_, C_), + make_tuple(NStrideTensorC_, DiStride_, HiStride_, WiStride_, CStrideTensorC_)); + } + else + { + throw std::runtime_error("wrong! unsupported layout: " + CLayout::name()); + } + } + + template < + typename ALayout_ = ALayout, + typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) && + (is_same_v || + is_same_v || + is_same_v || + is_same_v), + bool>::type = false> + __host__ __device__ auto MakeADescriptor_AK0_M_AK1() const + { // n_do_ho_wo_k for 3d or n_ho_wo_k for 2d - const auto out_grid_desc = - make_out_grid_desc( - N, Do, Ho, Wo, K, out_g_n_k_wos_strides); + const auto out_grid_desc = MakeOutGridDesc(); if constexpr(ConvBwdDataSpecialization == ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: Filter1x1Stride1Pad0) { - const index_t AK0 = math::integer_divide_ceil(K, AK1); + const index_t AK0 = math::integer_divide_ceil(K_, AK1); // A: output tensor const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor( out_grid_desc, - make_tuple(make_pass_through_transform(N * Do * Ho * Wo), + make_tuple(make_pass_through_transform(N_ * Do_ * Ho_ * Wo_), make_unmerge_transform(make_tuple(AK0, AK1))), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0, 2>{})); @@ -266,82 +635,63 @@ struct TransformConvBwdDataToGemm_v1 } else { - const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD); - const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); - const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); - - const auto 
ZTilde = ConvStrideD / GcdStrideDilationD; - const auto YTilde = ConvStrideH / GcdStrideDilationH; - const auto XTilde = ConvStrideW / GcdStrideDilationW; - - const auto ZDot = math::integer_divide_ceil(Z, ZTilde); - const auto YDot = math::integer_divide_ceil(Y, YTilde); - const auto XDot = math::integer_divide_ceil(X, XTilde); - - const auto DTilde = - Do + math::integer_divide_ceil(ConvDilationD * (Z - I1), ConvStrideD); - const auto HTilde = - Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); - const auto WTilde = - Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); - // only work on HTilde and WTilde that contribute to non-padding area of input tensor const auto IDTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadD - ConvDilationD * (ZTilde - I1)), ConvStrideD); + math::max(I0, InLeftPadD_ - ConvDilationD_ * (ZTilde_ - I1)), ConvStrideD_); const auto IHTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_); const auto IWTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_); const auto IDTildeSliceEnd = math::min( - DTilde, math::integer_divide_ceil(InLeftPadD + Di - I1, ConvStrideD) + I1); + DTilde_, math::integer_divide_ceil(InLeftPadD_ + Di_ - I1, ConvStrideD_) + I1); const auto IHTildeSliceEnd = math::min( - HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1); const auto IWTildeSliceEnd = math::min( - WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1); const auto DTildeSlice = IDTildeSliceEnd - 
IDTildeSliceBegin; const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; // GemmK is different for each GEMM - const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde); - const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); - const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + const auto ZDotSlice = math::integer_divide_ceil(Z_ - IdxZTilde_, ZTilde_); + const auto YDotSlice = math::integer_divide_ceil(Y_ - IdxYTilde_, YTilde_); + const auto XDotSlice = math::integer_divide_ceil(X_ - IdxXTilde_, XTilde_); if constexpr(NDimSpatial == 2) { // A: output tensor const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( out_grid_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Ho, I0, I0), - make_pad_transform(Wo, I0, I0), - make_pass_through_transform(K)), + make_tuple(make_pass_through_transform(N_), + make_pad_transform(Ho_, I0, I0), + make_pad_transform(Wo_, I0, I0), + make_pass_through_transform(K_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( out_n_hop_wop_k_grid_desc, make_tuple( - make_pass_through_transform(N), - make_embed_transform(make_tuple(YDot, HTilde), - make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), - make_embed_transform(make_tuple(XDot, WTilde), - make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), - make_pass_through_transform(K)), + make_pass_through_transform(N_), + make_embed_transform(make_tuple(YDot_, HTilde_), + make_tuple(-ConvDilationH_ / GcdStrideDilationH_, I1)), + make_embed_transform(make_tuple(XDot_, WTilde_), + make_tuple(-ConvDilationW_ / GcdStrideDilationW_, I1)), + make_pass_through_transform(K_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), 
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc = transform_tensor_descriptor( out_n_ydot_htilde_xdot_wtilde_k_grid_desc, - make_tuple(make_pass_through_transform(N), - make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), - make_pass_through_transform(K)), + make_tuple(make_pass_through_transform(N_), + make_slice_transform(YDot_, I0, YDotSlice), + make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice), + make_slice_transform(XDot_, I0, XDotSlice), + make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(K_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -357,8 +707,8 @@ struct TransformConvBwdDataToGemm_v1 const auto out_gemmk_gemmmraw_grid_desc = transform_tensor_descriptor( out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc, - make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K)), - make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice))), + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K_)), + make_merge_transform(make_tuple(N_, HTildeSlice, WTildeSlice))), make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -385,11 +735,11 @@ struct TransformConvBwdDataToGemm_v1 // A: output tensor const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( out_grid_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Do, I0, I0), - make_pad_transform(Ho, I0, I0), - make_pad_transform(Wo, I0, I0), - make_pass_through_transform(K)), + make_tuple(make_pass_through_transform(N_), + make_pad_transform(Do_, I0, I0), + make_pad_transform(Ho_, I0, I0), + make_pad_transform(Wo_, I0, I0), + make_pass_through_transform(K_)), make_tuple( 
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), make_tuple( @@ -398,17 +748,17 @@ struct TransformConvBwdDataToGemm_v1 const auto out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( out_n_hop_wop_k_grid_desc, - make_tuple(make_pass_through_transform(N), + make_tuple(make_pass_through_transform(N_), make_embed_transform( - make_tuple(ZDot, DTilde), - make_tuple(-ConvDilationD / GcdStrideDilationD, I1)), + make_tuple(ZDot_, DTilde_), + make_tuple(-ConvDilationD_ / GcdStrideDilationD_, I1)), make_embed_transform( - make_tuple(YDot, HTilde), - make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_tuple(YDot_, HTilde_), + make_tuple(-ConvDilationH_ / GcdStrideDilationH_, I1)), make_embed_transform( - make_tuple(XDot, WTilde), - make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), - make_pass_through_transform(K)), + make_tuple(XDot_, WTilde_), + make_tuple(-ConvDilationW_ / GcdStrideDilationW_, I1)), + make_pass_through_transform(K_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -424,14 +774,15 @@ struct TransformConvBwdDataToGemm_v1 out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc = transform_tensor_descriptor( out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc, - make_tuple(make_pass_through_transform(N), - make_slice_transform(ZDot, I0, ZDotSlice), - make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice), - make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), - make_pass_through_transform(K)), + make_tuple( + make_pass_through_transform(N_), + make_slice_transform(ZDot_, I0, ZDotSlice), + make_slice_transform(DTilde_, IDTildeSliceBegin, DTildeSlice), + make_slice_transform(YDot_, I0, YDotSlice), + make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice), + 
make_slice_transform(XDot_, I0, XDotSlice), + make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(K_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -452,8 +803,9 @@ struct TransformConvBwdDataToGemm_v1 const auto out_gemmk_gemmmraw_grid_desc = transform_tensor_descriptor( out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc, make_tuple( - make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K)), - make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice))), + make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K_)), + make_merge_transform( + make_tuple(N_, DTildeSlice, HTildeSlice, WTildeSlice))), make_tuple(Sequence<1, 3, 5, 7>{}, Sequence<0, 2, 4, 6>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -482,66 +834,31 @@ struct TransformConvBwdDataToGemm_v1 } } - template || - is_same_v), + (is_same_v || + is_same_v), bool>::type = false> - static auto MakeBDescriptor_BK0_N_BK1( - const std::array& out_g_n_k_wos_lengths, - const std::array& /* out_g_n_k_wos_strides */, - const std::array& wei_g_k_c_xs_lengths, - const std::array& /* wei_g_k_c_xs_strides */, - const std::array& in_g_n_c_wis_lengths, - const std::array& /* in_g_n_c_wis_strides */, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& /* input_left_pads */, - const std::array& /* input_right_pads */, - const std::array& tildes) + __host__ __device__ auto MakeBDescriptor_BK0_N_BK1() const { - index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum]; - index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum]; - index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum]; - - const index_t N = in_g_n_c_wis_lengths[1]; - const index_t K = wei_g_k_c_xs_lengths[1]; - const index_t C = wei_g_k_c_xs_lengths[2]; - - const index_t Do = NDimSpatial == 3 ? 
out_g_n_k_wos_lengths[DIdx] : 1; - const index_t Ho = out_g_n_k_wos_lengths[HIdx]; - const index_t Wo = out_g_n_k_wos_lengths[WIdx]; - - const index_t Z = NDimSpatial == 3 ? wei_g_k_c_xs_lengths[ZIdx] : 1; - const index_t Y = wei_g_k_c_xs_lengths[YIdx]; - const index_t X = wei_g_k_c_xs_lengths[XIdx]; - - const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum]; - const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum]; - const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum]; - - const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum]; - const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum]; - const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum]; - // assume packed // k_y_x_c for 2d or k_z_y_x_c for 3d - const auto wei_grid_desc = make_wei_grid_desc(K, Z, Y, X, C); + const auto wei_grid_desc = MakeWeiGridDesc(); if constexpr(ConvBwdDataSpecialization == ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: Filter1x1Stride1Pad0) { - const index_t BK0 = math::integer_divide_ceil(K, BK1); + const index_t BK0 = math::integer_divide_ceil(K_, BK1); // B: weight tensor const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc = - transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)), + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K_, C_)), make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), - make_pass_through_transform(C)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - make_naive_tensor_descriptor(make_tuple(N * Do * Ho * Wo, C), make_tuple(I0, I1)); + make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, C_), make_tuple(I0, I1)); const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = ck::tensor_operation::device::PadTensorDescriptor( @@ -553,22 +870,10 @@ struct TransformConvBwdDataToGemm_v1 } 
else { - const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD); - const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); - const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); - - const auto ZTilde = ConvStrideD / GcdStrideDilationD; - const auto YTilde = ConvStrideH / GcdStrideDilationH; - const auto XTilde = ConvStrideW / GcdStrideDilationW; - - const auto ZDot = math::integer_divide_ceil(Z, ZTilde); - const auto YDot = math::integer_divide_ceil(Y, YTilde); - const auto XDot = math::integer_divide_ceil(X, XTilde); - // GemmK is different for each GEMM - const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde); - const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); - const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + const auto ZDotSlice = math::integer_divide_ceil(Z_ - IdxZTilde_, ZTilde_); + const auto YDotSlice = math::integer_divide_ceil(Y_ - IdxYTilde_, YTilde_); + const auto XDotSlice = math::integer_divide_ceil(X_ - IdxXTilde_, XTilde_); // B weight tensor if constexpr(NDimSpatial == 2) @@ -576,23 +881,23 @@ struct TransformConvBwdDataToGemm_v1 const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( wei_grid_desc, make_tuple( - make_pass_through_transform(K), - make_embed_transform(make_tuple(YDot, YTilde), - make_tuple(ConvStrideH / GcdStrideDilationH, I1)), - make_embed_transform(make_tuple(XDot, XTilde), - make_tuple(ConvStrideW / GcdStrideDilationW, I1)), - make_pass_through_transform(C)), + make_pass_through_transform(K_), + make_embed_transform(make_tuple(YDot_, YTilde_), + make_tuple(ConvStrideH_ / GcdStrideDilationH_, I1)), + make_embed_transform(make_tuple(XDot_, XTilde_), + make_tuple(ConvStrideW_ / GcdStrideDilationW_, I1)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const 
auto wei_k_ydotslice_xdotslice_c_grid_desc = transform_tensor_descriptor( wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc, - make_tuple(make_pass_through_transform(K), - make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_freeze_transform(i_ytilde), - make_freeze_transform(i_xtilde), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(K_), + make_slice_transform(YDot_, I0, YDotSlice), + make_slice_transform(XDot_, I0, XDotSlice), + make_freeze_transform(IdxYTilde_), + make_freeze_transform(IdxXTilde_), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, @@ -608,8 +913,8 @@ struct TransformConvBwdDataToGemm_v1 const auto wei_gemmk_gemmnraw_grid_desc = transform_tensor_descriptor( wei_k_ydotslice_xdotslice_c_grid_desc, - make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K)), - make_pass_through_transform(C)), + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K_)), + make_pass_through_transform(C_)), make_tuple(Sequence<1, 2, 0>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -636,15 +941,17 @@ struct TransformConvBwdDataToGemm_v1 const auto wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( wei_grid_desc, - make_tuple( - make_pass_through_transform(K), - make_embed_transform(make_tuple(ZDot, ZTilde), - make_tuple(ConvStrideD / GcdStrideDilationD, I1)), - make_embed_transform(make_tuple(YDot, YTilde), - make_tuple(ConvStrideH / GcdStrideDilationH, I1)), - make_embed_transform(make_tuple(XDot, XTilde), - make_tuple(ConvStrideW / GcdStrideDilationW, I1)), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(K_), + make_embed_transform( + make_tuple(ZDot_, ZTilde_), + make_tuple(ConvStrideD_ / GcdStrideDilationD_, I1)), + make_embed_transform( + make_tuple(YDot_, YTilde_), + make_tuple(ConvStrideH_ / GcdStrideDilationH_, I1)), + make_embed_transform( + 
make_tuple(XDot_, XTilde_), + make_tuple(ConvStrideW_ / GcdStrideDilationW_, I1)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -659,14 +966,14 @@ struct TransformConvBwdDataToGemm_v1 const auto wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc = transform_tensor_descriptor( wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc, - make_tuple(make_pass_through_transform(K), - make_slice_transform(ZDot, I0, ZDotSlice), - make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_freeze_transform(i_ztilde), - make_freeze_transform(i_ytilde), - make_freeze_transform(i_xtilde), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(K_), + make_slice_transform(ZDot_, I0, ZDotSlice), + make_slice_transform(YDot_, I0, YDotSlice), + make_slice_transform(XDot_, I0, XDotSlice), + make_freeze_transform(IdxZTilde_), + make_freeze_transform(IdxYTilde_), + make_freeze_transform(IdxXTilde_), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, @@ -686,8 +993,9 @@ struct TransformConvBwdDataToGemm_v1 const auto wei_gemmk_gemmnraw_grid_desc = transform_tensor_descriptor( wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc, - make_tuple(make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K)), - make_pass_through_transform(C)), + make_tuple( + make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K_)), + make_pass_through_transform(C_)), make_tuple(Sequence<1, 2, 3, 0>{}, Sequence<4>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -716,66 +1024,20 @@ struct TransformConvBwdDataToGemm_v1 } } - template || - is_same_v || - is_same_v || - is_same_v || - is_same_v), - bool>::type = false> - static auto - MakeCDescriptor_M_N(const std::array& out_g_n_k_wos_lengths, - const std::array& /* out_g_n_k_wos_strides */, - const std::array& wei_g_k_c_xs_lengths, - const std::array& /* wei_g_k_c_xs_strides */, - const 
std::array& in_g_n_c_wis_lengths, - const std::array& in_g_n_c_wis_strides, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& input_left_pads, - const std::array& input_right_pads, - const std::array& tildes) + template < + typename CLayout_ = CLayout, + typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) && + (is_same_v || + is_same_v || + is_same_v || + is_same_v || + is_same_v), + bool>::type = false> + __host__ __device__ auto MakeCDescriptor_M_N() const { - index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum]; - index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum]; - index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum]; - - const index_t N = in_g_n_c_wis_lengths[1]; - const index_t C = wei_g_k_c_xs_lengths[2]; - - const index_t Di = NDimSpatial == 3 ? in_g_n_c_wis_lengths[DIdx] : 1; - const index_t Hi = in_g_n_c_wis_lengths[HIdx]; - const index_t Wi = in_g_n_c_wis_lengths[WIdx]; - - const index_t Do = NDimSpatial == 3 ? out_g_n_k_wos_lengths[DIdx] : 1; - const index_t Ho = out_g_n_k_wos_lengths[HIdx]; - const index_t Wo = out_g_n_k_wos_lengths[WIdx]; - - const index_t Z = NDimSpatial == 3 ? 
wei_g_k_c_xs_lengths[ZIdx] : 1; - const index_t Y = wei_g_k_c_xs_lengths[YIdx]; - const index_t X = wei_g_k_c_xs_lengths[XIdx]; - - const index_t InLeftPadD = input_left_pads[DIdx - NonSpatialDimsNum]; - const index_t InLeftPadH = input_left_pads[HIdx - NonSpatialDimsNum]; - const index_t InLeftPadW = input_left_pads[WIdx - NonSpatialDimsNum]; - - const index_t InRightPadD = input_right_pads[DIdx - NonSpatialDimsNum]; - const index_t InRightPadH = input_right_pads[HIdx - NonSpatialDimsNum]; - const index_t InRightPadW = input_right_pads[WIdx - NonSpatialDimsNum]; - - const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum]; - const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum]; - const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum]; - - const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum]; - const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum]; - const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum]; - // assume strided // n_hi_wi_c for 2d n_di_hi_wi_c for 3d - const auto in_grid_desc = - make_in_grid_desc(N, Di, Hi, Wi, C, in_g_n_c_wis_strides); + const auto in_grid_desc = MakeInGridDesc(); if constexpr(ConvBwdDataSpecialization == ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: @@ -787,10 +1049,10 @@ struct TransformConvBwdDataToGemm_v1 const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( in_grid_desc, make_tuple( - make_pass_through_transform(N), - make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)), - make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), - make_pass_through_transform(C)), + make_pass_through_transform(N_), + make_embed_transform(make_tuple(I1, Ho_), make_tuple(I1, ConvStrideH_)), + make_embed_transform(make_tuple(I1, Wo_), make_tuple(I1, ConvStrideW_)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, 
Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); @@ -798,8 +1060,8 @@ struct TransformConvBwdDataToGemm_v1 in_n_y_ho_x_wo_c_grid_desc, make_tuple(make_freeze_transform(I0), make_freeze_transform(I0), - make_merge_transform(make_tuple(N, Ho, Wo)), - make_pass_through_transform(C)), + make_merge_transform(make_tuple(N_, Ho_, Wo_)), + make_pass_through_transform(C_)), make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5>{}), make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); @@ -818,11 +1080,11 @@ struct TransformConvBwdDataToGemm_v1 const auto in_n_x_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( in_grid_desc, make_tuple( - make_pass_through_transform(N), - make_embed_transform(make_tuple(I1, Do), make_tuple(I1, ConvStrideD)), - make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)), - make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), - make_pass_through_transform(C)), + make_pass_through_transform(N_), + make_embed_transform(make_tuple(I1, Do_), make_tuple(I1, ConvStrideD_)), + make_embed_transform(make_tuple(I1, Ho_), make_tuple(I1, ConvStrideH_)), + make_embed_transform(make_tuple(I1, Wo_), make_tuple(I1, ConvStrideW_)), + make_pass_through_transform(C_)), make_tuple( Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), make_tuple(Sequence<0>{}, @@ -836,8 +1098,8 @@ struct TransformConvBwdDataToGemm_v1 make_tuple(make_freeze_transform(I0), make_freeze_transform(I0), make_freeze_transform(I0), - make_merge_transform(make_tuple(N, Do, Ho, Wo)), - make_pass_through_transform(C)), + make_merge_transform(make_tuple(N_, Do_, Ho_, Wo_)), + make_pass_through_transform(C_)), make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, @@ -861,36 +1123,21 @@ struct TransformConvBwdDataToGemm_v1 } else { - const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD); - const auto GcdStrideDilationH = 
math::gcd(ConvStrideH, ConvDilationH); - const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); - - const auto ZTilde = ConvStrideD / GcdStrideDilationD; - const auto YTilde = ConvStrideH / GcdStrideDilationH; - const auto XTilde = ConvStrideW / GcdStrideDilationW; - - const auto DTilde = - Do + math::integer_divide_ceil(ConvDilationD * (Z - I1), ConvStrideD); - const auto HTilde = - Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); - const auto WTilde = - Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); - // only work on DTilde, HTilde and WTilde that contribute to // non-padding area of input tensor const auto IDTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadD - ConvDilationD * (ZTilde - I1)), ConvStrideD); + math::max(I0, InLeftPadD_ - ConvDilationD_ * (ZTilde_ - I1)), ConvStrideD_); const auto IHTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_); const auto IWTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_); const auto IDTildeSliceEnd = math::min( - DTilde, math::integer_divide_ceil(InLeftPadD + Di - I1, ConvStrideD) + I1); + DTilde_, math::integer_divide_ceil(InLeftPadD_ + Di_ - I1, ConvStrideD_) + I1); const auto IHTildeSliceEnd = math::min( - HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1); const auto IWTildeSliceEnd = math::min( - WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1); const auto DTildeSlice = IDTildeSliceEnd - IDTildeSliceBegin; const auto HTildeSlice = 
IHTildeSliceEnd - IHTildeSliceBegin; @@ -901,34 +1148,34 @@ struct TransformConvBwdDataToGemm_v1 { const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( in_grid_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Hi, InLeftPadH, InRightPadH), - make_pad_transform(Wi, InLeftPadW, InRightPadW), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_pad_transform(Hi_, InLeftPadH_, InRightPadH_), + make_pad_transform(Wi_, InLeftPadW_, InRightPadW_), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( in_n_hip_wip_c_grid_desc, - make_tuple(make_pass_through_transform(N), - make_embed_transform(make_tuple(YTilde, HTilde), - make_tuple(ConvDilationH, ConvStrideH)), - make_embed_transform(make_tuple(XTilde, WTilde), - make_tuple(ConvDilationW, ConvStrideW)), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_embed_transform(make_tuple(YTilde_, HTilde_), + make_tuple(ConvDilationH_, ConvStrideH_)), + make_embed_transform(make_tuple(XTilde_, WTilde_), + make_tuple(ConvDilationW_, ConvStrideW_)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple( Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor( in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc, - make_tuple(make_pass_through_transform(N), - make_freeze_transform(i_ytilde), - make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), - make_freeze_transform(i_xtilde), - make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_freeze_transform(IdxYTilde_), + 
make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(IdxXTilde_), + make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -944,8 +1191,8 @@ struct TransformConvBwdDataToGemm_v1 const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor( in_n_htildeslice_wtildeslice_c_grid_desc, - make_tuple(make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), - make_pass_through_transform(C)), + make_tuple(make_merge_transform(make_tuple(N_, HTildeSlice, WTildeSlice)), + make_pass_through_transform(C_)), make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -961,11 +1208,11 @@ struct TransformConvBwdDataToGemm_v1 { const auto in_n_dip_hip_wip_c_grid_desc = transform_tensor_descriptor( in_grid_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Di, InLeftPadD, InRightPadD), - make_pad_transform(Hi, InLeftPadH, InRightPadH), - make_pad_transform(Wi, InLeftPadW, InRightPadW), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_pad_transform(Di_, InLeftPadD_, InRightPadD_), + make_pad_transform(Hi_, InLeftPadH_, InRightPadH_), + make_pad_transform(Wi_, InLeftPadW_, InRightPadW_), + make_pass_through_transform(C_)), make_tuple( Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), make_tuple( @@ -974,14 +1221,14 @@ struct TransformConvBwdDataToGemm_v1 const auto in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( in_n_dip_hip_wip_c_grid_desc, - make_tuple(make_pass_through_transform(N), - make_embed_transform(make_tuple(ZTilde, DTilde), - make_tuple(ConvDilationD, ConvStrideD)), - make_embed_transform(make_tuple(YTilde, HTilde), - make_tuple(ConvDilationH, ConvStrideH)), - make_embed_transform(make_tuple(XTilde, WTilde), - make_tuple(ConvDilationW, ConvStrideW)), - 
make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_embed_transform(make_tuple(ZTilde_, DTilde_), + make_tuple(ConvDilationD_, ConvStrideD_)), + make_embed_transform(make_tuple(YTilde_, HTilde_), + make_tuple(ConvDilationH_, ConvStrideH_)), + make_embed_transform(make_tuple(XTilde_, WTilde_), + make_tuple(ConvDilationW_, ConvStrideW_)), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -996,14 +1243,14 @@ struct TransformConvBwdDataToGemm_v1 const auto in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor( in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc, - make_tuple(make_pass_through_transform(N), - make_freeze_transform(i_ztilde), - make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice), - make_freeze_transform(i_ytilde), - make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), - make_freeze_transform(i_xtilde), - make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), - make_pass_through_transform(C)), + make_tuple(make_pass_through_transform(N_), + make_freeze_transform(IdxZTilde_), + make_slice_transform(DTilde_, IDTildeSliceBegin, DTildeSlice), + make_freeze_transform(IdxYTilde_), + make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(IdxXTilde_), + make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C_)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -1024,8 +1271,8 @@ struct TransformConvBwdDataToGemm_v1 const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor( in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc, make_tuple( - make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice)), - make_pass_through_transform(C)), + make_merge_transform(make_tuple(N_, DTildeSlice, HTildeSlice, WTildeSlice)), + make_pass_through_transform(C_)), make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}), 
make_tuple(Sequence<0>{}, Sequence<1>{})); @@ -1044,84 +1291,41 @@ struct TransformConvBwdDataToGemm_v1 } // for input bias - template || - is_same_v), + (is_same_v || + is_same_v), bool>::type = false> - static auto - MakeCDescriptor_M_N(const std::array& out_g_n_k_wos_lengths, - const std::array& /* out_g_n_k_wos_strides */, - const std::array& wei_g_k_c_xs_lengths, - const std::array& /* wei_g_k_c_xs_strides */, - const std::array& in_g_n_c_wis_lengths, - const std::array& /* in_g_n_c_wis_strides */, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& input_left_pads, - const std::array& /* input_right_pads */, - const std::array& /* tildes */) + __host__ __device__ auto MakeCDescriptor_M_N() const { - const index_t N = in_g_n_c_wis_lengths[1]; - const index_t C = wei_g_k_c_xs_lengths[2]; - - const index_t Hi = in_g_n_c_wis_lengths[3]; - const index_t Wi = in_g_n_c_wis_lengths[4]; - - const index_t Ho = out_g_n_k_wos_lengths[3]; - const index_t Wo = out_g_n_k_wos_lengths[4]; - - const index_t Y = wei_g_k_c_xs_lengths[3]; - const index_t X = wei_g_k_c_xs_lengths[4]; - - const index_t InLeftPadH = input_left_pads[0]; - const index_t InLeftPadW = input_left_pads[1]; - - const index_t ConvStrideH = conv_filter_strides[0]; - const index_t ConvStrideW = conv_filter_strides[1]; - - const index_t ConvDilationH = conv_filter_dilations[0]; - const index_t ConvDilationW = conv_filter_dilations[1]; - if constexpr(ConvBwdDataSpecialization == ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: Filter1x1Stride1Pad0) { const auto in_gemmm_gemmn_grid_desc = - make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, C), make_tuple(I0, I1)); + make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, C_), make_tuple(I0, I1)); return in_gemmm_gemmn_grid_desc; } else { - const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); - const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); - - 
const auto YTilde = ConvStrideH / GcdStrideDilationH; - const auto XTilde = ConvStrideW / GcdStrideDilationW; - - const auto HTilde = - Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); - const auto WTilde = - Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); - // only work on HTilde and WTilde that contribute to non-padding area of input tensor const auto IHTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_); const auto IWTildeSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_); const auto IHTildeSliceEnd = math::min( - HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1); const auto IWTildeSliceEnd = math::min( - WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1); const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; // bias tensor const auto in_gemmmraw_gemmnraw_grid_desc = make_naive_tensor_descriptor( - make_tuple(N * HTildeSlice * WTildeSlice, C), make_tuple(I0, I1)); + make_tuple(N_ * HTildeSlice * WTildeSlice, C_), make_tuple(I0, I1)); const auto in_gemmm_gemmn_grid_desc = ck::tensor_operation::device::PadTensorDescriptor( in_gemmmraw_gemmnraw_grid_desc, @@ -1131,6 +1335,25 @@ struct TransformConvBwdDataToGemm_v1 return in_gemmm_gemmn_grid_desc; } } + + IndexType N_; + IndexType Di_, Hi_, Wi_; + IndexType Do_, Ho_, Wo_; + IndexType Z_, Y_, X_; + IndexType K_, C_; + IndexType DiStride_, HiStride_, WiStride_; + IndexType DoStride_, HoStride_, WoStride_; + 
IndexType CStrideTensorB_, CStrideTensorC_, KStrideTensorA_, KStrideTensorB_; + IndexType NStrideTensorA_, NStrideTensorC_; + IndexType ConvStrideD_, ConvStrideH_, ConvStrideW_; + IndexType ConvDilationD_, ConvDilationH_, ConvDilationW_; + IndexType InLeftPadD_, InLeftPadH_, InLeftPadW_; + IndexType InRightPadD_, InRightPadH_, InRightPadW_; + IndexType IdxZTilde_, IdxYTilde_, IdxXTilde_; + IndexType GcdStrideDilationD_, GcdStrideDilationH_, GcdStrideDilationW_; + IndexType ZTilde_, YTilde_, XTilde_; + IndexType DTilde_, HTilde_, WTilde_; + IndexType ZDot_, YDot_, XDot_; }; } // namespace tensor_operation diff --git a/test/grouped_convnd_bwd_data/CMakeLists.txt b/test/grouped_convnd_bwd_data/CMakeLists.txt index 8edb715200..6d78da8db7 100644 --- a/test/grouped_convnd_bwd_data/CMakeLists.txt +++ b/test/grouped_convnd_bwd_data/CMakeLists.txt @@ -1,6 +1,10 @@ -add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data_xdl_wmma.cpp) +add_gtest_executable(test_grouped_convnd_bwd_data_xdl test_grouped_convnd_bwd_data_xdl.cpp) if(result EQUAL 0) - target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) + target_link_libraries(test_grouped_convnd_bwd_data_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) +endif() +add_gtest_executable(test_grouped_convnd_bwd_data_wmma test_grouped_convnd_bwd_data_wmma.cpp) +if(result EQUAL 0) + target_link_libraries(test_grouped_convnd_bwd_data_wmma PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) endif() add_gtest_executable(test_grouped_convnd_bwd_data_interface_xdl test_grouped_convnd_bwd_data_interface_xdl.cpp) if(result EQUAL 0) diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp new file mode 100644 index 0000000000..7ad7b78d6f --- 
/dev/null +++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include + +#include "profiler/profile_grouped_conv_bwd_data_impl.hpp" + +template +class TestGroupedConvndBwdDataWmma : public ::testing::Test +{ + protected: + using DataType = std::tuple_element_t<0, Tuple>; + using OutLayout = std::tuple_element_t<1, Tuple>; + using WeiLayout = std::tuple_element_t<2, Tuple>; + using InLayout = std::tuple_element_t<3, Tuple>; + + std::vector conv_params; + + template + void Run() + { + EXPECT_FALSE(conv_params.empty()); + bool pass = true; + for(auto& param : conv_params) + { + pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl( + true, // do_verification + 1, // init_method: integer value + false, // do_log + false, // time_kernel + param); + } + EXPECT_TRUE(pass); + } +}; + +using namespace ck::tensor_layout::convolution; + +using KernelTypes2d = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; + +using KernelTypes3d = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; + +template +class TestGroupedConvndBwdDataWmma2d : public TestGroupedConvndBwdDataWmma +{ +}; + +template +class TestGroupedConvndBwdDataWmma3d : public TestGroupedConvndBwdDataWmma +{ +}; + +TYPED_TEST_SUITE(TestGroupedConvndBwdDataWmma2d, KernelTypes2d); +TYPED_TEST_SUITE(TestGroupedConvndBwdDataWmma3d, KernelTypes3d); + +TYPED_TEST(TestGroupedConvndBwdDataWmma2d, Test2D) +{ + this->conv_params.clear(); + + this->conv_params.push_back( + {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back( + {2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back( + {2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 2, 128, 128, 
256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->template Run<2>(); +} + +TYPED_TEST(TestGroupedConvndBwdDataWmma3d, Test3D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {3, 2, 16, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->conv_params.push_back( + {3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->conv_params.push_back( + {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->conv_params.push_back( + {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->template Run<3>(); +} diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_wmma.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp similarity index 78% rename from test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_wmma.cpp rename to test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp index 96506b876d..fdc8fb64e5 100644 --- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_wmma.cpp +++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -12,7 +12,7 @@ #include "profiler/profile_grouped_conv_bwd_data_impl.hpp" template -class TestGroupedConvndBwdData : public ::testing::Test +class TestGroupedConvndBwdDataXdl : public ::testing::Test { protected: using DataType = std::tuple_element_t<0, Tuple>; @@ -51,35 +51,31 @@ using namespace ck::tensor_layout::convolution; using KernelTypes2d = ::testing::Types, std::tuple, std::tuple, - std::tuple, std::tuple, std::tuple, - std::tuple, - std::tuple>; + std::tuple>; using KernelTypes3d = ::testing::Types, std::tuple, std::tuple, - std::tuple, std::tuple, std::tuple, - std::tuple, - std::tuple>; + std::tuple>; template -class TestGroupedConvndBwdData2d : public TestGroupedConvndBwdData +class TestGroupedConvndBwdDataXdl2d : public TestGroupedConvndBwdDataXdl { }; template -class TestGroupedConvndBwdData3d : public TestGroupedConvndBwdData +class TestGroupedConvndBwdDataXdl3d : public TestGroupedConvndBwdDataXdl { }; -TYPED_TEST_SUITE(TestGroupedConvndBwdData2d, KernelTypes2d); -TYPED_TEST_SUITE(TestGroupedConvndBwdData3d, KernelTypes3d); +TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl2d, KernelTypes2d); +TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl3d, KernelTypes3d); -TYPED_TEST(TestGroupedConvndBwdData2d, Test2D) +TYPED_TEST(TestGroupedConvndBwdDataXdl2d, Test2D) { this->conv_params.clear(); @@ -94,10 +90,13 @@ TYPED_TEST(TestGroupedConvndBwdData2d, Test2D) this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + // SplitN case + this->conv_params.push_back( + {2, 1, 128, 4, 192, {2, 2}, {224, 224}, {224, 224}, {1, 1}, {0, 0}, {0, 0}}); this->template Run<2>(); } -TYPED_TEST(TestGroupedConvndBwdData3d, Test3D) +TYPED_TEST(TestGroupedConvndBwdDataXdl3d, Test3D) { this->conv_params.clear(); 
this->conv_params.push_back( @@ -112,5 +111,17 @@ TYPED_TEST(TestGroupedConvndBwdData3d, Test3D) {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); this->conv_params.push_back( {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + // SplitN case + this->conv_params.push_back({3, + 1, + 128, + 4, + 192, + {2, 2, 2}, + {2, 224, 224}, + {1, 224, 224}, + {1, 1, 1}, + {0, 0, 0}, + {0, 0, 0}}); this->template Run<3>(); } From 5e6bd75a725e2c77459bb045b814b7eaded948f9 Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Date: Fri, 6 Dec 2024 09:56:27 -0600 Subject: [PATCH 42/52] Add copy assignment op test (#1718) * Add copy assignment op test * Add a deep copy testing --- test/data_type/test_custom_type.cpp | 82 +++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/test/data_type/test_custom_type.cpp b/test/data_type/test_custom_type.cpp index a8fa9ba4a0..b8c0d402a2 100644 --- a/test/data_type/test_custom_type.cpp +++ b/test/data_type/test_custom_type.cpp @@ -51,8 +51,11 @@ TEST(Custom_bool, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_bool_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -129,8 +132,11 @@ TEST(Custom_int8, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_int8_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // 
check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -207,8 +213,11 @@ TEST(Custom_uint8, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_uint8_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -287,8 +296,11 @@ TEST(Custom_f8, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_f8_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -369,8 +381,11 @@ TEST(Custom_bf8, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_bf8_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -450,8 +465,11 @@ TEST(Custom_half, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_half_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = 
right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -533,8 +551,11 @@ TEST(Custom_bhalf, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_bhalf_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -615,8 +636,11 @@ TEST(Custom_float, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_float_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -693,8 +717,11 @@ TEST(Custom_double, TestAsType) ck::static_for<0, size, 1>{}([&](auto i) { right_vec.template AsType()(Number{}) = custom_double_t{test_vec.at(i)}; }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).data, test_vec.at(i)); @@ -813,8 +840,11 @@ TEST(Complex_half, TestAsType) right_vec.template AsType()(Number{}) = complex_half_t{test_vec.at(num_elem * i), test_vec.at(num_elem * i + 1)}; }); - // copy the vector - vector_type 
left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { ASSERT_EQ(left_vec.template AsType()(Number{}).real, @@ -907,8 +937,11 @@ TEST(FP8OCP, TestAsType) right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); }); - // copy the vector - vector_type left_vec{right_vec}; + vector_type left_vec; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { @@ -984,8 +1017,11 @@ TEST(BF8OCP, TestAsType) right_vec.template AsType()(Number{}) = ck::type_convert(test_vec.at(i)); }); - // copy the vector vector_type left_vec{right_vec}; + // check copy assignment op + left_vec = right_vec; + // overwrite right_vec with 0s + right_vec = vector_type{}; // check if values were copied correctly ck::static_for<0, size, 1>{}([&](auto i) { From 355893cdd85418f3174a023aeb1ddba008951660 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 6 Dec 2024 13:04:25 -0800 Subject: [PATCH 43/52] Refactor CI performance tests. 
(#1726) * merge the build and performance tests CI stages together * add gemm performance test on gfx11/gfx12 * add suffixes to distinguish gemm performance logs from different archs * use smaller gemm set in CI for gfx10/gfx11/gfx12 * disable performance tests on gfx1030 * fix the stashing logic * fix finding python3 for mha instances --- Jenkinsfile | 286 ++++++------------ .../gpu/mha/CMakeLists.txt | 6 +- script/process_perf_data.py | 4 +- script/process_perf_data.sh | 13 + script/process_qa_data.sh | 12 + script/run_full_performance_tests.sh | 2 +- script/run_gemm_performance_tests.sh | 41 +++ script/run_performance_tests.sh | 21 +- 8 files changed, 176 insertions(+), 209 deletions(-) create mode 100755 script/run_gemm_performance_tests.sh diff --git a/Jenkinsfile b/Jenkinsfile index 58cd72c8ce..0a98cc5c6d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -330,10 +330,8 @@ def cmake_build(Map conf=[:]){ try{ archiveArtifacts "perf_fmha_fwd_*.log" archiveArtifacts "perf_fmha_bwd_*.log" - stash name: "perf_fmha_fwd_gfx942.log" - stash name: "perf_fmha_bwd_gfx942.log" - stash name: "perf_fmha_fwd_gfx90a.log" - stash name: "perf_fmha_bwd_gfx90a.log" + stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942" + stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a" } catch(Exception err){ echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." 
@@ -408,128 +406,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){ } } -def runCKProfiler(Map conf=[:]){ - show_node_info() - - env.HSA_ENABLE_SDMA=0 - checkout scm - - def image = getDockerImageName() - def prefixpath = conf.get("prefixpath", "/opt/rocm") - - // Jenkins is complaining about the render group - def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" - if (conf.get("enforce_xnack_on", false)) { - dockerOpts = dockerOpts + " --env HSA_XNACK=1 " - } - def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') - def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3') - dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} " - echo "Docker flags: ${dockerOpts}" - - def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - - def variant = env.STAGE_NAME - def retimage - - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { - try { - (retimage, image) = getDockerImage(conf) - withDockerContainer(image: image, args: dockerOpts) { - timeout(time: 5, unit: 'MINUTES'){ - sh 'rocminfo | tee rocminfo.log' - if ( !runShell('grep -n "gfx" rocminfo.log') ){ - throw new Exception ("GPU not found") - } - else{ - echo "GPU is OK" - } - } - } - } - catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ - echo "The job was cancelled or aborted" - throw e - } - - withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 24, unit: 'HOURS') - { - sh """ - rm -rf build - mkdir build - """ - dir("build"){ - unstash 'ckProfiler.tar.gz' - sh 'tar -xvf ckProfiler.tar.gz' - } - - dir("script"){ - if 
(params.RUN_FULL_QA){ - sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" - archiveArtifacts "perf_gemm.log" - archiveArtifacts "perf_resnet50_N256.log" - archiveArtifacts "perf_resnet50_N4.log" - archiveArtifacts "perf_batched_gemm.log" - archiveArtifacts "perf_grouped_gemm.log" - archiveArtifacts "perf_grouped_conv_fwd.log" - archiveArtifacts "perf_grouped_conv_bwd_data.log" - archiveArtifacts "perf_grouped_conv_bwd_weight.log" - archiveArtifacts "perf_gemm_bilinear.log" - archiveArtifacts "perf_reduction.log" - archiveArtifacts "perf_splitK_gemm.log" - archiveArtifacts "perf_onnx_gemm.log" - archiveArtifacts "perf_mixed_gemm.log" - // stash perf files to master - stash name: "perf_gemm.log" - stash name: "perf_resnet50_N256.log" - stash name: "perf_resnet50_N4.log" - stash name: "perf_batched_gemm.log" - stash name: "perf_grouped_gemm.log" - stash name: "perf_grouped_conv_fwd.log" - stash name: "perf_grouped_conv_bwd_data.log" - stash name: "perf_grouped_conv_bwd_weight.log" - stash name: "perf_gemm_bilinear.log" - stash name: "perf_reduction.log" - stash name: "perf_splitK_gemm.log" - stash name: "perf_onnx_gemm.log" - stash name: "perf_mixed_gemm.log" - //we will process results on the master node - } - else{ - sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" - archiveArtifacts "perf_gemm.log" - archiveArtifacts "perf_resnet50_N256.log" - archiveArtifacts "perf_resnet50_N4.log" - // stash perf files to master - stash name: "perf_gemm.log" - stash name: "perf_resnet50_N256.log" - stash name: "perf_resnet50_N4.log" - //we will process the results on the master node - } - } - } - } - } - return retimage -} - -def runPerfTest(Map conf=[:]){ - try{ - runCKProfiler(conf) - } - catch(e){ - echo "throwing error exception in performance tests" - echo 'Exception occurred: ' + e.toString() - throw e - } - finally{ - if (!conf.get("no_reboot", false)) { - reboot() - } - } -} 
- def Build_CK(Map conf=[:]){ show_node_info() @@ -589,36 +465,95 @@ def Build_CK(Map conf=[:]){ throw e } withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 24, unit: 'HOURS') + timeout(time: 12, unit: 'HOURS') { //check whether to run performance tests on this node - def do_perf_tests = 0 + def arch_type = 0 sh 'rocminfo | tee rocminfo.log' - if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx1201" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){ - do_perf_tests = 1 - echo "Stash profiler and run performance tests" + if ( runShell('grep -n "gfx90a" rocminfo.log') ){ + arch_type = 1 + } + else if ( runShell('grep -n "gfx942" rocminfo.log') ) { + arch_type = 2 + } + else if ( runShell('grep -n "gfx1030" rocminfo.log') ) { + arch_type = 3 + } + else if ( runShell('grep -n "gfx1101" rocminfo.log') ) { + arch_type = 4 + } + else if ( runShell('grep -n "gfx1201" rocminfo.log') ) { + arch_type = 5 } cmake_build(conf) dir("build"){ - //run tests and examples - //sh 'make -j check' - if (params.RUN_PERFORMANCE_TESTS && do_perf_tests == 0 ){ - //we only need the ckProfiler to run the performance tests, so we pack and stash it - //do not stash profiler on nodes where we don't need to run performance tests - sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler' - stash name: "ckProfiler.tar.gz" - } - if (params.RUN_FULL_QA && do_perf_tests == 0 ){ - // build deb packages for all gfx9 targets and prepare to export + if (params.RUN_FULL_QA && arch_type == 1 ){ + // build deb packages for all gfx9 targets on gfx90a system and prepare to export + echo "Build ckProfiler package" sh 'make -j package' archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb' - archiveArtifacts artifacts: 'composablekernel-tests_*.deb' sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb' - stash name: "ckprofiler_0.2.0_amd64.deb" + stash includes: 
"ckprofiler_0.2.0_amd64.deb", name: "ckprofiler_0.2.0_amd64.deb" } } - if (params.hipTensor_test && do_perf_tests == 0 ){ - //build and test hipTensor + // run performance tests, stash the logs, results will be processed on the master node + dir("script"){ + if (params.RUN_PERFORMANCE_TESTS){ + if (params.RUN_FULL_QA && arch_type == 1){ + // run full tests on gfx90a + echo "Run full performance tests" + sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" + archiveArtifacts "perf_gemm.log" + archiveArtifacts "perf_resnet50_N256.log" + archiveArtifacts "perf_resnet50_N4.log" + archiveArtifacts "perf_batched_gemm.log" + archiveArtifacts "perf_grouped_gemm.log" + archiveArtifacts "perf_grouped_conv_fwd.log" + archiveArtifacts "perf_grouped_conv_bwd_data.log" + archiveArtifacts "perf_grouped_conv_bwd_weight.log" + archiveArtifacts "perf_gemm_bilinear.log" + archiveArtifacts "perf_reduction.log" + archiveArtifacts "perf_splitK_gemm.log" + archiveArtifacts "perf_onnx_gemm.log" + archiveArtifacts "perf_mixed_gemm.log" + stash includes: "perf_**.log", name: "perf_log" + } + else if ( arch_type == 1 ){ + // run standard tests on gfx90a + echo "Run performance tests" + sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" + archiveArtifacts "perf_gemm.log" + archiveArtifacts "perf_onnx_gemm.log" + archiveArtifacts "perf_resnet50_N256.log" + archiveArtifacts "perf_resnet50_N4.log" + stash includes: "perf_**.log", name: "perf_log" + } + // disable performance tests on gfx1030 for now. 
+ //else if ( arch_type == 3){ + // run basic tests on gfx1030 + // echo "Run gemm performance tests" + // sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx10" + // archiveArtifacts "perf_onnx_gemm_gfx10.log" + // stash includes: "perf_onnx_gemm_gfx10.log", name: "perf_log_gfx10" + //} + else if ( arch_type == 4){ + // run basic tests on gfx11 + echo "Run gemm performance tests" + sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx11" + archiveArtifacts "perf_onnx_gemm_gfx11.log" + stash includes: "perf_onnx_gemm_gfx11.log", name: "perf_log_gfx11" + } + else if ( arch_type == 5 ){ + // run basic tests on gfx12 + echo "Run gemm performance tests" + sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12" + archiveArtifacts "perf_onnx_gemm_gfx12.log" + stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12" + } + } + } + if (params.hipTensor_test && arch_type == 1 ){ + // build and test hipTensor on gfx90a node sh """#!/bin/bash rm -rf "${params.hipTensor_branch}".zip rm -rf hipTensor-"${params.hipTensor_branch}" @@ -690,10 +625,8 @@ def process_results(Map conf=[:]){ dir("script"){ if (params.RUN_CK_TILE_FMHA_TESTS){ try{ - unstash "perf_fmha_fwd_gfx942.log" - unstash "perf_fmha_bwd_gfx942.log" - unstash "perf_fmha_fwd_gfx90a.log" - unstash "perf_fmha_bwd_gfx90a.log" + unstash "perf_fmha_log_gfx942" + unstash "perf_fmha_log_gfx90a" } catch(Exception err){ echo "could not locate the FMHA performance logs: ${err.getMessage()}." 
@@ -703,26 +636,26 @@ def process_results(Map conf=[:]){ // unstash perf files to master unstash "ckprofiler_0.2.0_amd64.deb" sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/" - unstash "perf_gemm.log" - unstash "perf_resnet50_N256.log" - unstash "perf_resnet50_N4.log" - unstash "perf_batched_gemm.log" - unstash "perf_grouped_gemm.log" - unstash "perf_grouped_conv_fwd.log" - unstash "perf_grouped_conv_bwd_data.log" - unstash "perf_grouped_conv_bwd_weight.log" - unstash "perf_gemm_bilinear.log" - unstash "perf_reduction.log" - unstash "perf_splitK_gemm.log" - unstash "perf_onnx_gemm.log" - unstash "perf_mixed_gemm.log" + unstash "perf_log" + try{ + unstash "perf_log_gfx11" + unstash "perf_log_gfx12" + } + catch(Exception err){ + echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}." + } sh "./process_qa_data.sh" } else{ // unstash perf files to master - unstash "perf_gemm.log" - unstash "perf_resnet50_N256.log" - unstash "perf_resnet50_N4.log" + unstash "perf_log" + try{ + unstash "perf_log_gfx11" + unstash "perf_log_gfx12" + } + catch(Exception err){ + echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}." 
+ } sh "./process_perf_data.sh" } } @@ -1241,29 +1174,6 @@ pipeline { } } } - - stage("Performance Tests") - { - parallel - { - stage("Run ckProfiler: gfx90a") - { - when { - beforeAgent true - expression { params.RUN_PERFORMANCE_TESTS.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() } - } - options { retry(1) } - agent{ label rocmnode("gfx90a")} - environment{ - setup_args = "NO_CK_BUILD" - } - steps{ - runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') - cleanWs() - } - } - } - } stage("Process Performance Test Results") { parallel diff --git a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt index a53fde1662..0457588ea6 100644 --- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt @@ -6,7 +6,7 @@ set(CK_TILE_SRC_FOLDER ${CMAKE_SOURCE_DIR}/include/ck_tile/) # CK Codegen requires dataclass which is added in Python 3.7 # Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04 if(NOT CK_USE_ALTERNATIVE_PYTHON) - find_package(PythonInterp 3 REQUIRED) + find_package(Python3 COMPONENTS Interpreter Development) else() message("Using alternative python version") set(EXTRA_PYTHON_PATH) @@ -33,7 +33,7 @@ set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd") # Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time. # With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad. 
execute_process( - COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py + COMMAND ${Python3_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt --api ${FMHA_KNOWN_APIS} --receipt 3 @@ -50,7 +50,7 @@ endif() # With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad. add_custom_command( OUTPUT ${FMHA_GEN_BLOBS} - COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py + COMMAND ${Python3_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py --output_dir ${FMHA_CPP_FOLDER} --api ${FMHA_KNOWN_APIS} --receipt 3 diff --git a/script/process_perf_data.py b/script/process_perf_data.py index 3892206e42..fbfec94eef 100644 --- a/script/process_perf_data.py +++ b/script/process_perf_data.py @@ -82,7 +82,7 @@ def parse_logfile(logfile): StrideA=[] StrideB=[] StrideC=[] - if 'perf_gemm.log' in logfile: + if 'perf_gemm' in logfile and 'gemm_bilinear' not in logfile: for line in open(logfile): if 'Best Perf' in line: lst=line.split() @@ -260,7 +260,7 @@ def main(): conn = sqlEngine.connect() #save gemm performance tests: - if 'perf_gemm.log' in filename: + if 'perf_gemm' in filename and 'gemm_bilinear' not in filename: #write the ck_gemm_test_params table only needed once the test set changes #post_test_params(test_list,conn) for i in range(1,len(results)+1): diff --git a/script/process_perf_data.sh b/script/process_perf_data.sh index af1e7e7a0d..ae93463204 100755 --- a/script/process_perf_data.sh +++ b/script/process_perf_data.sh @@ -11,9 +11,22 @@ #process results python3 process_perf_data.py perf_gemm.log +python3 process_perf_data.py perf_onnx_gemm.log python3 process_perf_data.py perf_resnet50_N256.log python3 process_perf_data.py perf_resnet50_N4.log +file=./perf_onnx_gemm_gfx10.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx10.log +fi +file=./perf_onnx_gemm_gfx11.log +if [ -e "$file" ]; then + python3 process_perf_data.py 
perf_onnx_gemm_gfx11.log +fi +file=./perf_onnx_gemm_gfx12.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx12.log +fi file=./perf_fmha_fwd_gfx942.log if [ -e "$file" ]; then python3 process_perf_data.py perf_fmha_fwd_gfx942.log diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh index c9a1645f6e..fb8fe01c6e 100755 --- a/script/process_qa_data.sh +++ b/script/process_qa_data.sh @@ -24,6 +24,18 @@ python3 process_perf_data.py perf_splitK_gemm.log python3 process_perf_data.py perf_onnx_gemm.log python3 process_perf_data.py perf_mixed_gemm.log +file=./perf_onnx_gemm_gfx10.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx10.log +fi +file=./perf_onnx_gemm_gfx11.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx11.log +fi +file=./perf_onnx_gemm_gfx12.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx12.log +fi file=./perf_fmha_fwd_gfx942.log if [ -e "$file" ]; then python3 process_perf_data.py perf_fmha_fwd_gfx942.log diff --git a/script/run_full_performance_tests.sh b/script/run_full_performance_tests.sh index e167ce012b..ddc5c270b8 100755 --- a/script/run_full_performance_tests.sh +++ b/script/run_full_performance_tests.sh @@ -5,7 +5,7 @@ # post your new test results to the database and compare them to the baseline # please contact Illia.Silin@amd.com for more details # -# run the script as "./run_full_performance_tests.sh < node name> +# run the script as "./run_full_performance_tests.sh # input arguments: # verification = 0 : do not verify result correctness on CPU # = 1 : verifuy correctness on CPU (may take a long time) diff --git a/script/run_gemm_performance_tests.sh b/script/run_gemm_performance_tests.sh new file mode 100755 index 0000000000..12adad30f8 --- /dev/null +++ b/script/run_gemm_performance_tests.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# +# in order to run this script you'd first need to build the ckProfiler executable in 
../build/bin/ +# run the script as "./run_gemm_performance_tests.sh <verification> <tag> <branch name> <node name> <arch> +# input arguments: +# verification = 0 : do not verify result correctness on CPU +# = 1 : verify correctness on CPU (may take a long time) +# environment tag : a string describing the specifics of your test environment +# branch name : name of the branch in git repo (git status | grep -e 'On branch') +# node name : $hostname +# arch : GPU architecture, e.g. "gfx9" or "gfx1100" + +#get the command line arguments: +export verify=$1 +echo 'Verification: ' $verify +export env_type=$2 +echo 'Environment type: ' $env_type +export branch=$3 +echo 'Branch name: ' $branch +export host_name=$4 +echo 'Host name: ' $host_name +export arch=$5 +echo 'GPU architecture: ' $arch + +function print_log_header(){ + rm -f $1; + echo 'On branch ' $3 &> $1; + echo 'Node name: ' $4 >> $1; + #get GPU_arch and number of compute units from rocminfo + echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; + rocminfo | grep "Compute Unit:" >> $1; + hipcc --version | grep -e 'HIP version' >> $1; + echo 'Environment type: ' $2 >> $1; + /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; +} + +#run ONNX gemm tests +export onnx_log="perf_onnx_gemm_$arch.log" +print_log_header $onnx_log $env_type $branch $host_name +./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log +./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log diff --git a/script/run_performance_tests.sh b/script/run_performance_tests.sh index 317d270983..c8a281dc07 100755 --- a/script/run_performance_tests.sh +++ b/script/run_performance_tests.sh @@ -1,7 +1,7 @@ #!/bin/bash # # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/ -# run the script as "./run_performance_tests.sh < node name> +# run the script as "./run_performance_tests.sh # input arguments: # verification = 0 : do not verify result correctness on CPU # = 1 : verify correctness on CPU (may take 
a long time) @@ -51,20 +51,11 @@ print_log_header $gemm_log $env_type $branch $host_name ./profile_gemm.sh gemm 2 3 $verify 1 0 1 | tee -a $gemm_log ./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log -#run grouped_fwd fp16 tests -export grouped_conv_fwd_log="perf_grouped_conv_fwd_fp16.log" -print_log_header $conv_fwd_log $env_type $branch $host_name -./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log - -#run grouped_bwd_data fp16 tests -export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data_fp16.log" -print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name -./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log - -#run grouped_bwd_weight fp16 tests -export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight_fp16.log" -print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name -./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 1 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log +#run ONNX gemm tests +export onnx_log="perf_onnx_gemm.log" +print_log_header $onnx_log $env_type $branch $host_name +./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log +./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log #run resnet50 tests export resnet256_log="perf_resnet50_N256.log" From c773cc25a235dbc3c044b9cf7fb32910bc8fcae0 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 9 Dec 2024 08:50:36 -0800 Subject: [PATCH 44/52] remove unnecessary file (#1732) --- modified_files.txt | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100755 modified_files.txt diff --git a/modified_files.txt b/modified_files.txt deleted file mode 100755 index 34a42e3f37..0000000000 --- a/modified_files.txt +++ /dev/null @@ -1,10 +0,0 @@ -example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp 
-example/01_gemm/run_gemm_example_streamk_v2.inc -include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp -include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp -library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp -library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp -library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp -library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp -profiler/src/profile_gemm_universal_streamk.cpp -modified_files.txt From 2f088b870764d406ec453987198deb298f3e9e3a Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 9 Dec 2024 09:32:14 -0800 Subject: [PATCH 45/52] update CI timeout limits (#1733) --- Jenkinsfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0a98cc5c6d..cb344e8a57 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -377,7 +377,7 @@ def buildHipClangJob(Map conf=[:]){ gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 48, unit: 'HOURS') + timeout(time: 20, unit: 'HOURS') { cmake_build(conf) } @@ -449,7 +449,7 @@ def Build_CK(Map conf=[:]){ try { (retimage, image) = getDockerImage(conf) withDockerContainer(image: image, args: dockerOpts) 
{ - timeout(time: 5, unit: 'MINUTES'){ + timeout(time: 2, unit: 'MINUTES'){ sh 'rocminfo | tee rocminfo.log' if ( !runShell('grep -n "gfx" rocminfo.log') ){ throw new Exception ("GPU not found") @@ -465,7 +465,7 @@ def Build_CK(Map conf=[:]){ throw e } withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 12, unit: 'HOURS') + timeout(time: 20, unit: 'HOURS') { //check whether to run performance tests on this node def arch_type = 0 @@ -620,7 +620,7 @@ def process_results(Map conf=[:]){ } withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 1, unit: 'HOURS'){ + timeout(time: 15, unit: 'MINUTES'){ try{ dir("script"){ if (params.RUN_CK_TILE_FMHA_TESTS){ From 23cf2026b496140e73a2990199f79e6257b228c7 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:11:20 -0800 Subject: [PATCH 46/52] build CI for gfx12 by default (#1734) --- Jenkinsfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index cb344e8a57..f118d4e458 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -675,8 +675,8 @@ def process_results(Map conf=[:]){ //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version CRON_SETTINGS = BRANCH_NAME == "develop" ? 
'''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true 0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true - 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true - 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true + 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true + 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false 0 13 * * * % BUILD_LEGACY_OS=true''' : "" @@ -763,8 +763,8 @@ pipeline { description: "Test building instances for various architectures simultaneously (default: OFF)") booleanParam( name: "BUILD_GFX12", - defaultValue: false, - description: "Build CK and run tests on gfx12 (default: OFF)") + defaultValue: true, + description: "Build CK and run tests on gfx12 (default: ON)") booleanParam( name: "NINJA_BUILD_TRACE", defaultValue: false, From 94ae7113bd05e3c39364193dba1b391a4c54a2f4 Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 10 Dec 2024 11:36:18 +0800 Subject: [PATCH 47/52] [CK TILE] Use config name instead of data type in FmhaFwdTypeConfig (#1731) * Add data type config, Prepare to add mix precision in the future * Fix compile error --- .../ck_tile/01_fmha/codegen/cpp_symbol_map.py | 15 ++- .../ck_tile/01_fmha/codegen/ops/fmha_bwd.py | 14 +-- .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 11 ++- .../01_fmha/codegen/ops/fmha_fwd_appendkv.py | 9 +- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 27 ++--- 
example/ck_tile/01_fmha/fmha_bwd.cpp | 14 +-- example/ck_tile/01_fmha/fmha_bwd.hpp | 12 ++- example/ck_tile/01_fmha/fmha_fwd.cpp | 99 ++++++++++--------- example/ck_tile/01_fmha/fmha_fwd.hpp | 32 +++++- 9 files changed, 142 insertions(+), 91 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py index 66691356ab..f6df44a318 100644 --- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py +++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py @@ -2,10 +2,17 @@ # Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. # generate kernel instances to speed up compilation -DTYPE_MAP = { - "fp16": "ck_tile::fp16_t", - "bf16": "ck_tile::bf16_t", - "fp8" : "ck_tile::fp8_t" +FWD_DTYPE_MAP = { + "fp16" : "FmhaFwdFp16", + "bf16" : "FmhaFwdBf16", + "fp8" : "FmhaFwdFp8", + "fp8fp16": "FmhaFwdFp8Fp16", + "fp8bf16": "FmhaFwdFp8Bf16" +} + +BWD_DTYPE_MAP = { + "fp16": "FmhaBwdFp16", + "bf16": "FmhaBwdBf16" } MASK_IMPL = { diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 096394c0c9..83a1e82d6d 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -283,7 +283,7 @@ class FmhaBwdApiPool: inners = inners + FMHA_BWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_pipeline_enum=BWD_DQDKDV_PIPELINE_ENUM_MAP[trait.pipeline], F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout], - F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype], + F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, 
F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=BWD_DTYPE_MAP[dtype], F_spad0=BOOL_MAP[trait.spad], F_spad1=BOOL_MAP[spad1], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], F_deterministic=BOOL_MAP[trait.deterministic]) @@ -360,7 +360,7 @@ class FmhaBwdDQDKDVKernel: FMHA_BWD_DQ_DK_DV_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = BWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn0 = self.F_tile.F_bn0, F_bk0 = self.F_tile.F_bk0, @@ -469,7 +469,7 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> gen = list() api_pool = FmhaBwdApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in BWD_DTYPE_MAP.keys(): d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype) if d == None: continue @@ -585,7 +585,7 @@ class FmhaBwdOGradDotOKernel: FMHA_BWD_DOT_DO_O_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = BWD_DTYPE_MAP[self.F_dtype], F_spad = BOOL_MAP[self.F_spad], F_dvpad = BOOL_MAP[self.F_dvpad], F_mode = MODE_MAP[self.F_mode], @@ -616,7 +616,7 @@ def get_bwd_dot_do_o_blobs() -> List[FmhaBwdOGradDotOKernel]: gen = list() - for dtype in DTYPE_MAP.keys(): + for dtype in BWD_DTYPE_MAP.keys(): d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype) if d == None: continue @@ -716,7 +716,7 @@ class FmhaBwdConvertQGradKernel: FMHA_BWD_CONVERT_DQ_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = BWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_bm0, F_bn0 = self.F_bn0, F_spad = BOOL_MAP[self.F_spad], @@ -751,7 +751,7 @@ def get_bwd_convert_dq_blobs() -> List[FmhaBwdConvertQGradKernel]: gen = list() - for dtype in DTYPE_MAP.keys(): + for dtype in BWD_DTYPE_MAP.keys(): d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py 
b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index e5ee1d22e7..eca638784d 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -282,7 +282,7 @@ class FmhaFwdApiPool: F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max, - F_hdim=hdim, F_dtype=DTYPE_MAP[dtype]) + F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype]) if_j = 'if' if j == 0 else 'else if' per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) if_i = 'if' if i == 0 else 'else if' @@ -301,7 +301,7 @@ class FmhaFwdTileSize: F_bk1 : int # tile size along kv gemm unroll F_bk0max : int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile) F_rm0 : int # number of warps for gemm0 along q seqlen - F_rn0 : int # number of warps for gemm0 along k seqlen + F_rn0 : int # number of warps for gemm0 along k seqlen F_rk0 : int # number of warps for gemm0 along head dim q (not used) F_rm1 : int # number of warps for gemm1 along q seqlen F_rn1 : int # number of warps for gemm1 along head dim v @@ -339,7 +339,7 @@ class FmhaFwdKernel: FMHA_FWD_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn0 = self.F_tile.F_bn0, F_bk0 = self.F_tile.F_bk0, @@ -462,6 +462,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm # no need lse/dropout kernels for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', 'f', 
squant, mask)) + elif dtype in ['fp8fp16', 'fp8bf16']: + # TODO + None else: assert False return pipelines @@ -469,7 +472,7 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm gen = list() api_pool = FmhaFwdApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_tile_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py index cfd1d01c91..fb998a33d7 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py @@ -181,7 +181,7 @@ class FmhaFwdAppendKVApiPool: inners = inners + FMHA_FWD_APPENDKV_API_INNER_DISPATCH.format(F_if=if_k, F_vlayout=LAYOUT_MAP[trait.vlayout], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_rope_check=ROPE_CHECK_MAP[trait.rope], F_pagedkv=BOOL_MAP[trait.pagedkv], F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], - F_rope=ROPE_MAP[trait.rope], F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype]) + F_rope=ROPE_MAP[trait.rope], F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv, F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype]) if_j = 'if' if j == 0 else 'else if' per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) if_i = 'if' if i == 0 else 'else if' @@ -216,7 +216,7 @@ class FmhaFwdAppendKVKernel: FMHA_FWD_APPENDKV_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bs = self.F_tile.F_bs, F_bsk = self.F_tile.F_bsk, F_bd = self.F_tile.F_bd, @@ -301,6 +301,9 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> elif dtype in 
['fp8', 'bf8']: # rope/paged-kv is not supported pipelines.append(FmhaFwdAppendKVPipeline('col', 't', 't', 't', 't', 'no', 'f')) + elif dtype in ['fp8fp16', 'fp8bf16']: + # TODO + None else: assert False return pipelines @@ -308,7 +311,7 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> gen = list() api_pool = FmhaFwdAppendKVApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_appendkv_tile_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 1c40cf6f31..e448902cf8 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -112,7 +112,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) }} using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, - {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, + {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; #include @@ -161,7 +161,7 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaSplitKVCombinePipelineProblem< typename FmhaFwdTypeConfig::OaccDataType, typename FmhaFwdTypeConfig::ODataType, {F_hdim}, - {F_bm0}, + {F_bm0}, {F_bn1}, {F_mode}, fmha_trait>; @@ -231,11 +231,11 @@ float fmha_fwd_splitkv_(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a if(s.log_level_ > 0) std::cout << ", " << fmha_fwd_splitkv_get_name_() - << ", " << fmha_fwd_splitkv_combine_get_name_() + << ", " << fmha_fwd_splitkv_combine_get_name_() << std::flush; return ck_tile::launch_kernel(s, - [=](const ck_tile::stream_config& s_){{ fmha_fwd_splitkv_oneshot_(s_, a); }}, + [=](const ck_tile::stream_config& s_){{ 
fmha_fwd_splitkv_oneshot_(s_, a); }}, [=](const ck_tile::stream_config& s_){{ fmha_fwd_splitkv_combine_oneshot_(s_, a); }} ); }} @@ -431,11 +431,11 @@ class FmhaFwdSplitKVApiPool: inners = inners + FMHA_FWD_SPLITKV_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout], F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias], - F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv], + F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max, - F_hdim=hdim, F_dtype=DTYPE_MAP[dtype]) + F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype]) if_j = 'if' if j == 0 else 'else if' per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) if_i = 'if' if i == 0 else 'else if' @@ -472,7 +472,7 @@ class FmhaFwdSplitKVKernel: FMHA_FWD_SPLITKV_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn0 = self.F_tile.F_bn0, F_bk0 = self.F_tile.F_bk0, @@ -492,7 +492,7 @@ class FmhaFwdSplitKVKernel: F_spad = BOOL_MAP[self.F_pipeline.F_spad], F_skpad = BOOL_MAP[self.F_pipeline.F_skpad], F_dpad = BOOL_MAP[self.F_pipeline.F_dpad], - F_dvpad = BOOL_MAP[self.F_pipeline.F_dvpad], + F_dvpad = BOOL_MAP[self.F_pipeline.F_dvpad], F_bias = BIAS_MAP[self.F_pipeline.F_bias], F_lse = BOOL_MAP[self.F_pipeline.F_lse], F_squant = 
BOOL_MAP[self.F_pipeline.F_squant], @@ -552,7 +552,7 @@ class FmhaFwdSplitKVCombineKernel: FMHA_FWD_SPLITKV_COMBINE_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn1 = self.F_tile.F_bn1, F_spad = BOOL_MAP[self.F_pipeline.F_spad], @@ -625,7 +625,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> pipelines = [] if dtype in ['fp16', 'bf16']: for mask, bias, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]): - # TODO: use async pipeline when compiler is more stable + # TODO: use async pipeline when compiler is more stable if hdim == 256 or hdim in [32, 64, 128]: ### [32, 64, 96, 128]: # if True: pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) @@ -644,6 +644,9 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> elif dtype in ['fp8', 'bf8']: for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 't', squant, 'f', mask)) + elif dtype in ['fp8fp16', 'fp8bf16']: + # TODO + None else: assert False return pipelines @@ -651,7 +654,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> gen = list() api_pool = FmhaFwdSplitKVApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_tile_dict_from_dtype(dtype) if d == None: continue @@ -711,7 +714,7 @@ def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> Lis gen = list() - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/fmha_bwd.cpp b/example/ck_tile/01_fmha/fmha_bwd.cpp index 2d76627a72..eaf99529f3 100644 --- 
a/example/ck_tile/01_fmha/fmha_bwd.cpp +++ b/example/ck_tile/01_fmha/fmha_bwd.cpp @@ -101,7 +101,7 @@ auto create_args(int argc, char* argv[]) } // different threshold for different dtype -template +template auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/) { double rtol = 1e-2; @@ -110,7 +110,7 @@ auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/) } template <> -auto get_elimit(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v) +auto get_elimit(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v) { double rtol = 1e-2; double atol = 1e-2; @@ -122,7 +122,7 @@ auto get_elimit(ck_tile::index_t hdim_q, ck_tile::index_t hdim_ return ck_tile::make_tuple(rtol, atol); } -template +template bool run(const ck_tile::ArgParser& arg_parser) { std::string data_type = arg_parser.get_str("prec"); @@ -209,7 +209,7 @@ bool run(const ck_tile::ArgParser& arg_parser) const auto seqstart_q_host = generate_seqstarts(mode, batch, seqlen_q); const auto seqstart_k_host = generate_seqstarts(mode, batch, seqlen_k); - using TypeConfig = FmhaBwdTypeConfig; + using TypeConfig = FmhaBwdTypeConfig; using QDataType = typename TypeConfig::QDataType; using KDataType = typename TypeConfig::KDataType; @@ -933,7 +933,7 @@ bool run(const ck_tile::ArgParser& arg_parser) } // clang-format on - auto [rtol, atol] = get_elimit(hdim_q, hdim_v); + auto [rtol, atol] = get_elimit(hdim_q, hdim_v); bool dq_cur_pass = ck_tile::check_err(dq_host_result, dq_host_ref, std::string("Error: QGrad Incorrect results!"), @@ -986,11 +986,11 @@ int main(int argc, char* argv[]) const std::string data_type = arg_parser.get_str("prec"); if(data_type == "fp16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } else if(data_type == "bf16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 
0 : -2; } return -3; diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp index 722ef15a2f..6204cbcfa8 100644 --- a/example/ck_tile/01_fmha/fmha_bwd.hpp +++ b/example/ck_tile/01_fmha/fmha_bwd.hpp @@ -14,11 +14,19 @@ #include #include +struct FmhaBwdFp16 +{ +}; + +struct FmhaBwdBf16 +{ +}; + template struct FmhaBwdTypeConfig; template <> -struct FmhaBwdTypeConfig +struct FmhaBwdTypeConfig { using QDataType = ck_tile::half_t; using KDataType = ck_tile::half_t; @@ -38,7 +46,7 @@ struct FmhaBwdTypeConfig }; template <> -struct FmhaBwdTypeConfig +struct FmhaBwdTypeConfig { using QDataType = ck_tile::bf16_t; using KDataType = ck_tile::bf16_t; diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index 1f0d73d950..ebf2c93a33 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -142,7 +142,7 @@ auto create_args(int argc, char* argv[]) } // different threshold for different dtype -template +template auto get_elimit(std::string /*init_method*/) { double rtol = 1e-3; @@ -151,7 +151,7 @@ auto get_elimit(std::string /*init_method*/) } template <> -auto get_elimit(std::string /*init_method*/) +auto get_elimit(std::string /*init_method*/) { double rtol = 1e-2; double atol = 1e-2; @@ -159,7 +159,7 @@ auto get_elimit(std::string /*init_method*/) } template <> -auto get_elimit(std::string init_method) +auto get_elimit(std::string init_method) { if(init_method == "ui" || init_method == "ni") { @@ -261,7 +261,7 @@ int override_num_splits_if_necessary( return num_splits; } -template +template bool run(const ck_tile::ArgParser& arg_parser) { std::string data_type = arg_parser.get_str("prec"); @@ -305,8 +305,8 @@ bool run(const ck_tile::ArgParser& arg_parser) } ck_tile::index_t rotary_dim = arg_parser.get_int("rotary_dim"); - if constexpr(!(std::is_same_v || - std::is_same_v)) + if constexpr(!(std::is_same_v || + std::is_same_v)) { if(0 < rotary_dim) { @@ -428,25 +428,6 @@ 
bool run(const ck_tile::ArgParser& arg_parser) return atoi(squant_str.c_str()) != 0 ? true : false; }(); - float range_q = arg_parser.get_float("range_q"); - float range_k = arg_parser.get_float("range_k"); - float range_v = arg_parser.get_float("range_v"); - float range_p = arg_parser.get_float("range_p"); - float range_o = arg_parser.get_float("range_o"); - - float dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); - - float scale_p = 1.f; - float scale_o = 1.f; - - if(squant) - { - scale_s = scale_s * (range_q / dtype_max) * (range_k / dtype_max); - scale_p = dtype_max / range_p; - // scale_p = [max(fp8_t)/range_o] * [range_p/max(fp8_t)] * [range_v/max(fp8_t)] - scale_o = range_p * range_v / range_o / dtype_max; - } - std::string vlayout = arg_parser.get_str("vlayout"); bool lse = arg_parser.get_bool("lse"); @@ -499,7 +480,7 @@ bool run(const ck_tile::ArgParser& arg_parser) const auto seqstart_k_host = to_seqstarts(seqlen_ks); const auto seqstart_k_with_padding_host = to_seqstarts(seqlen_kpads); - using TypeConfig = FmhaFwdTypeConfig; + using TypeConfig = FmhaFwdTypeConfig; using QDataType = typename TypeConfig::QDataType; using KDataType = typename TypeConfig::KDataType; @@ -513,6 +494,28 @@ bool run(const ck_tile::ArgParser& arg_parser) using OaccDataType = typename TypeConfig::OaccDataType; using ODataType = typename TypeConfig::ODataType; + float range_q = arg_parser.get_float("range_q"); + float range_k = arg_parser.get_float("range_k"); + float range_v = arg_parser.get_float("range_v"); + float range_p = arg_parser.get_float("range_p"); + float range_o = arg_parser.get_float("range_o"); + + float q_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + float k_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + float v_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + float p_dtype_max = v_dtype_max; // assume p and v is the same type + float o_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + + float 
scale_p = 1.f; + float scale_o = 1.f; + + if(squant) + { + scale_s = scale_s * (range_q / q_dtype_max) * (range_k / k_dtype_max); + scale_p = p_dtype_max / range_p; + scale_o = (o_dtype_max / range_o) * (range_p / p_dtype_max) * (range_v / v_dtype_max); + } + // accumulation numbers for performance evaluation std::size_t flop = 0, num_byte = 0; auto max_seqlen_q = @@ -709,14 +712,14 @@ bool run(const ck_tile::ArgParser& arg_parser) else if(init_method == "ufq" || init_method == "uf:q" || init_method == "3") // suitable for fp8 quantization { - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(q_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(k_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(knew_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(v_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(vnew_host); + ck_tile::FillUniformDistribution{-q_dtype_max, q_dtype_max, seed}(q_host); + ck_tile::FillUniformDistribution{-k_dtype_max, k_dtype_max, seed}(k_host); + ck_tile::FillUniformDistribution{-k_dtype_max, k_dtype_max, seed}(knew_host); + ck_tile::FillUniformDistribution{-v_dtype_max, v_dtype_max, seed}(v_host); + ck_tile::FillUniformDistribution{-v_dtype_max, v_dtype_max, seed}(vnew_host); // bias_fp8 = qscale_bias * bias_fp32 - float qscale_bias = (dtype_max / range_q) * (dtype_max / range_k); + float qscale_bias = (q_dtype_max / range_q) * (k_dtype_max / range_k); // Assume bias is in [-1.f, 1.f] in original fp32 ck_tile::FillUniformDistribution{-qscale_bias, qscale_bias, seed}(bias_host); } @@ -1129,14 +1132,14 @@ bool run(const ck_tile::ArgParser& arg_parser) randval_buf.FromDevice(randval_host.data()); auto p_compute_element_func = [&]() { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) return ck_tile::scales{scale_p}; else return ck_tile::identity{}; }(); auto oacc_element_func = [&]() { - if constexpr(std::is_same_v) + if 
constexpr(std::is_same_v) return ck_tile::composes(ck_tile::saturates{}, ck_tile::scales{scale_o}); else @@ -1186,7 +1189,7 @@ bool run(const ck_tile::ArgParser& arg_parser) { decltype(q_host_ref) q_host_ref_ro(q_host_ref.get_lengths()); - auto [rotary_cos_slice, rotary_sin_slice] = + auto [rotary_cos_slice, rotary_sin_slice] = slice_rotary_cos_sin(rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], real_seqlen_q); ck_tile::reference_batched_rotary_position_embedding( @@ -1202,13 +1205,13 @@ bool run(const ck_tile::ArgParser& arg_parser) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[0] / nr, i[1] % page_block_size, i[2]); }); - } else { + } else { k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[1] % page_block_size, i[0] / nr, i[2]); }); } } else -#endif +#endif { if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[0] / nr, i[1] + key_offset, i[2]); }); else k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[1] + key_offset, i[0] / nr, i[2]); }); @@ -1229,7 +1232,7 @@ bool run(const ck_tile::ArgParser& arg_parser) { knew_host_ref_ro.emplace(knew_host_ref.get_lengths()); - auto [rotary_cos_slice, rotary_sin_slice] = + auto [rotary_cos_slice, rotary_sin_slice] = slice_rotary_cos_sin(rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], seqlen_knew); ck_tile::reference_batched_rotary_position_embedding( @@ -1251,19 +1254,19 @@ bool run(const ck_tile::ArgParser& arg_parser) if(0 < page_block_size) { if(is_v_rowmajor) { if(i_perm) { - v_host_ref.ForEach([&](auto& self, auto i) { - self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[2] % page_block_size, i[1]); + v_host_ref.ForEach([&](auto& self, auto i) { + self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[2] % page_block_size, i[1]); }); } else { - v_host_ref.ForEach([&](auto& 
self, auto i) { + v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[2] % page_block_size, i[0] / nr, i[1]); }); } } - else + else { - if(i_perm) { - v_host_ref.ForEach([&](auto& self, auto i) { + if(i_perm) { + v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[1], i[2] % page_block_size); }); } else { @@ -1458,7 +1461,7 @@ bool run(const ck_tile::ArgParser& arg_parser) else o_host_result.ForEach([&](auto& self, auto idx) { self(idx) = o_host(b_idx, idx[1] + query_offset, idx[0], idx[2]); }); // clang-format on - auto [rtol, atol] = get_elimit(init_method); + auto [rtol, atol] = get_elimit(init_method); bool cur_pass = ck_tile::check_err( o_host_result, o_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol); pass &= cur_pass; @@ -1515,15 +1518,15 @@ int main(int argc, char* argv[]) const std::string data_type = arg_parser.get_str("prec"); if(data_type == "fp16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } else if(data_type == "bf16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } else if(data_type == "fp8") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 
0 : -2; } return -3; diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 8a821b9177..aee54b4758 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -16,11 +16,35 @@ #include #include +struct FmhaFwdFp16 +{ +}; + +struct FmhaFwdBf16 +{ +}; + +struct FmhaFwdFp8 +{ +}; + +struct FmhaFwdBf8 +{ +}; + +struct FmhaFwdFp8Fp16 +{ +}; + +struct FmhaFwdFp8Bf16 +{ +}; + template struct FmhaFwdTypeConfig; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::half_t; using KDataType = ck_tile::half_t; @@ -36,7 +60,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::bf16_t; using KDataType = ck_tile::bf16_t; @@ -52,7 +76,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::fp8_t; using KDataType = ck_tile::fp8_t; @@ -68,7 +92,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::bf8_t; using KDataType = ck_tile::bf8_t; From 67497a044d450fbc0bcb099cfb0aa270cfb0aa6b Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:47:36 +0000 Subject: [PATCH 48/52] Make sure we call __hneg with half to remove ambigios error (#1736) --- include/ck/utility/math_v2.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp index a6c3540d85..eaa1c68138 100644 --- a/include/ck/utility/math_v2.hpp +++ b/include/ck/utility/math_v2.hpp @@ -611,7 +611,7 @@ inline __device__ int8_t neg(int8_t x) template <> inline __device__ half_t neg(half_t x) { - return __hneg(x); + return __hneg(static_cast<__half>(x)); }; template From 90d8410d562220ba65e7e75f10e7b3996409200f Mon Sep 17 00:00:00 2001 From: Illia Silin 
<98187287+illsilin@users.noreply.github.com> Date: Tue, 10 Dec 2024 08:48:51 -0800 Subject: [PATCH 49/52] Upgrade to Ubuntu22.04 as default OS. (#1738) * upgrade to ubuntu 22.04 * try adding -u roof docker options for ubuntu 22 --- Dockerfile | 5 +++-- Dockerfile.compiler | 2 +- Jenkinsfile | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6689ae08ff..8ce158a200 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:20.04 +FROM ubuntu:22.04 ARG DEBIAN_FRONTEND=noninteractive ARG ROCMVERSION=6.3 ARG compiler_version="" @@ -48,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- libnuma-dev \ libpthread-stubs0-dev \ llvm-amdgpu \ + mpich \ net-tools \ pkg-config \ python \ @@ -70,7 +71,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- rm -rf /var/lib/apt/lists/* && \ rm -rf amdgpu-install* && \ # Remove unnecessary rocm components that take a lot of space - apt-get remove -y rocblas rocfft rocsparse composablekernel-dev + apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt # Update the cmake to version 3.27.5 RUN pip install --upgrade cmake==3.27.5 && \ diff --git a/Dockerfile.compiler b/Dockerfile.compiler index 3f33290929..a22103b96b 100644 --- a/Dockerfile.compiler +++ b/Dockerfile.compiler @@ -1,4 +1,4 @@ -ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.3" +ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.3" FROM $BASE_DOCKER ARG compiler_version="" ARG compiler_commit="" diff --git a/Jenkinsfile b/Jenkinsfile index f118d4e458..f82c34afa6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -40,10 +40,10 @@ def getBaseDockerImageName(){ else{ def ROCM_numeric = "${params.ROCMVERSION}" as float if ( ROCM_numeric < 6.4 ){ - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" + img = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm${params.ROCMVERSION}" } else{ - img = 
"${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}" + img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub22.04_rocm${params.ROCMVERSION}" } } return img @@ -357,7 +357,7 @@ def buildHipClangJob(Map conf=[:]){ def prefixpath = conf.get("prefixpath", "/opt/rocm") // Jenkins is complaining about the render group - def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } @@ -426,7 +426,7 @@ def Build_CK(Map conf=[:]){ def prefixpath = conf.get("prefixpath", "/opt/rocm") // Jenkins is complaining about the render group - def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } From 357a0b1c57d2f6b4eb9607d26047ba2e0b679f72 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:16:03 -0800 Subject: [PATCH 50/52] add missing stdexcept header (#1740) --- codegen/test/rtc/include/rtc/hip.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/codegen/test/rtc/include/rtc/hip.hpp b/codegen/test/rtc/include/rtc/hip.hpp index 6b523382dc..e962d4cd3e 100644 --- a/codegen/test/rtc/include/rtc/hip.hpp +++ b/codegen/test/rtc/include/rtc/hip.hpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace rtc { From 77a38e0211f587775c233fc0afd4de819d51500c Mon Sep 17 00:00:00 2001 From: carlushuang Date: Thu, 12 Dec 2024 03:54:03 +0000 Subject: [PATCH 51/52] 
[CK_TILE] naive attn (#1708) * add reference attention fwd * refactor addresser * update * paged, and i8 reflect-quant * lets call it forward-quant * fix error in decode variation * update naive-attn * fix page table * fix build err --- example/ck_tile/01_fmha/fmha_fwd.cpp | 57 +- include/ck_tile/README.md | 3 + include/ck_tile/core.hpp | 1 + include/ck_tile/ops/gemm.hpp | 2 +- include/ck_tile/ref/README.md | 5 + include/ck_tile/ref/naive_attention.hpp | 666 ++++++++++++++++++++++++ include/ck_tile/remod.py | 4 + 7 files changed, 734 insertions(+), 4 deletions(-) create mode 100644 include/ck_tile/ref/README.md create mode 100644 include/ck_tile/ref/naive_attention.hpp diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index ebf2c93a33..08d263da91 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -3,6 +3,7 @@ #include "fmha_fwd.hpp" #include "ck_tile/host.hpp" +#include "ck_tile/ref/naive_attention.hpp" #include "mask.hpp" #include "rotary.hpp" #include "utils.hpp" @@ -41,7 +42,7 @@ std::ostream& operator<<(std::ostream& os, const std::vector& v) auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; - arg_parser.insert("v", "1", "weather do CPU validation or not") + arg_parser.insert("v", "1", "0:no validation, 2:cpu validation, 2:gpu validation(experimental)") .insert("mode", "0", "kernel mode. 
0:batch, 1:group") .insert("b", "2", "batch size") .insert("h", "8", "num of head, for q") @@ -447,7 +448,7 @@ bool run(const ck_tile::ArgParser& arg_parser) } bool s_randval = false; - if(p_drop > 0.0f && do_validation) + if(p_drop > 0.0f && do_validation != 0) { s_randval = true; } @@ -1121,11 +1122,61 @@ bool run(const ck_tile::ArgParser& arg_parser) << std::setprecision(2) << tflops << " TFlops, " << std::setprecision(2) << gb_per_sec << " GB/s" << std::flush; - if(!do_validation) + if(do_validation == 0) { std::cout << std::flush << std::endl; return true; } + if(do_validation == 2) + { + // NOTE: use gpu to do validation + ck_tile::naive_attention_fwd_traits naive_t; + naive_t.q_type = data_type; + naive_t.k_type = data_type; + naive_t.v_type = data_type; + naive_t.o_type = data_type; + naive_t.q_layout = i_perm == 1 ? "bhsd" : "bshd"; + naive_t.k_layout = i_perm == 1 ? "bhsd" : "bshd"; + naive_t.v_layout = i_perm == 1 ? "bhsd" : "bshd"; + naive_t.o_layout = o_perm == 1 ? "bhsd" : "bshd"; + naive_t.variation = 0; // TODO? 
+ + ck_tile::DeviceMem o_naive_buf(o_host.get_element_space_size_in_bytes()); + + ck_tile::naive_attention_fwd_args naive_a; + naive_a.q_ptr = q_buf.GetDeviceBuffer(); + naive_a.k_ptr = k_buf.GetDeviceBuffer(); + naive_a.v_ptr = v_buf.GetDeviceBuffer(); + naive_a.o_ptr = o_naive_buf.GetDeviceBuffer(); + naive_a.scale_s = scale_s; + naive_a.context_len_ptr = nullptr; // used when seqlen kv come from a pointer + naive_a.page_table_ptr = + nullptr; // [batch, num_blocks] seqlen_kv is in different block(paged attn) + naive_a.hdim = hdim_q; + naive_a.hdim_v = hdim_v; // could be cross-attn, where V and Q/K hdim are different + naive_a.batch_q = batch; + naive_a.batch_kv = batch; + naive_a.batch_ratio_kv = 1; // batch_q / batch_kv + naive_a.seqlen_q = seqlen_qs[0]; + naive_a.seqlen_kv = seqlen_ks[0]; // if context_len_ptr is not nullptr, ignore this field + naive_a.nhead_q = nhead; + naive_a.nhead_kv = nhead_k; + naive_a.nhead_ratio_kv = naive_a.nhead_q / naive_a.nhead_kv; // nhead_q / nhead_kv + naive_a.page_size = 0; // if paged, the seqlen-kv for each block + + ck_tile::stream_config naive_s{}; + + naive_attention_fwd(naive_t, naive_a, naive_s); + + auto o_naive_ref = o_naive_buf.ToHost(); + o_buf.FromDevice(o_host.data()); // TODO: ugly + + auto [rtol_, atol_] = get_elimit(init_method); + bool pass_ = ck_tile::check_err( + o_host, o_naive_ref, std::string("OUT Error: Incorrect results!"), rtol_, atol_); + std::cout << ", valid:" << (pass_ ? "y" : "n") << std::flush << std::endl; + return pass_; + } o_buf.FromDevice(o_host.data()); lse_buf.FromDevice(lse_host.data()); diff --git a/include/ck_tile/README.md b/include/ck_tile/README.md index 9f88af1ca1..9d5e923915 100644 --- a/include/ck_tile/README.md +++ b/include/ck_tile/README.md @@ -45,5 +45,8 @@ our implementation of different device operators. **[ops/epilogue]** epilogue part of our kernel. We may extend this epilogue part to let users to build their own cutomized epilogues. 
+**[ref]** +reference implementation of cpu or gpu. This folder is supposed to include a specific header on demand. + ## examples currently we put all ck_tile related example under [/example/ck_tile](/example/ck_tile/) folder. Please check each example's subfolder. diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 3cf0c2595d..41f3383c7f 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -54,6 +54,7 @@ #include "ck_tile/core/tensor/tile_window_linear.hpp" #include "ck_tile/core/tensor/tile_window_utils.hpp" #include "ck_tile/core/tensor/update_tile.hpp" +#include "ck_tile/core/utility/amd_address_space.hpp" #include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/functional_with_tuple.hpp" diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index 82d35b9c59..2d38ef5925 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -23,10 +23,10 @@ #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp" #include "ck_tile/ops/gemm/block/block_gemm_problem.hpp" #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp" +#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp" -#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp" diff --git a/include/ck_tile/ref/README.md b/include/ck_tile/ref/README.md new file mode 100644 index 0000000000..6efee782f6 --- /dev/null +++ b/include/ck_tile/ref/README.md @@ -0,0 +1,5 @@ +# reference + +this folder contains reference implementation of a specific op. 
Note that by including a specific header, you are including the implementation (especially the gpu implementation) into your source code and compiling that kernel into the fatbin, which may increase your kernel obj code length. Usually a header whose name starts with `reference_` is a cpu reference implementation. A header whose name starts with `naive_` contains a gpu implementation with a small launcher. + +TODO: move `host/reference` under this folder diff --git a/include/ck_tile/ref/naive_attention.hpp b/include/ck_tile/ref/naive_attention.hpp new file mode 100644 index 0000000000..09ded761eb --- /dev/null +++ b/include/ck_tile/ref/naive_attention.hpp @@ -0,0 +1,666 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include +#include + +namespace ck_tile { + +enum class naive_attention_layout_enum +{ + BSHD, // [batch, seqlen, nhead, hdim] + BHSD, // [batch, nhead, seqlen, hdim] + BS3HD, // [batch, nhead, 3, seqlen, hdim], used when qkv are packed + PHSD, // [pages, nhead, page_size, hdim] + // PHSDX, // [pages, nhead, page_size/x, hdim, x], where <# used pages>*page_size = seqlen + PHDSX, // [pages, nhead, hdim/x, page_size, x], where <# used pages>*page_size = seqlen + PHDS, // [pages, nhead, hdim, page_size], where <# used pages>*page_size = seqlen +}; + +// will used to specialize kernel variation +enum class naive_attention_variation_enum +{ + FLASH_BATCHED = 0, // standard flash attention, or xformer/sdpa, used for training + FLASH_GROUPED, + DECODE_PAGED, // decode attn, where kv token from another buffer called kvcache +}; + +// TODO: for simplicity, this will be used as host/device arg +struct naive_attention_fwd_args +{ + void* q_ptr; + void* k_ptr; + void* v_ptr; + void* o_ptr; + void* context_len_ptr; // [batch] used when seqlen kv come from a pointer(each element is a + // 
number, not cumsum) + void* page_table_ptr; // [batch, max_pages_per_seq] seqlen_kv is in different block(paged attn) + void* kvscale_ptr; // [nhead, 2(kv), hdim] used for kvcache dequant + float scale_s; + int hdim; + int hdim_v; // could be cross-attn, where V and Q/K hdim are different + int batch_q; + int batch_kv; + int batch_ratio_kv; // batch_q / batch_kv + int seqlen_q; // in decode case, this should be 1 + int seqlen_kv; // if context_len_ptr is not nullptr, ignore this field + int nhead_q; + int nhead_kv; + int nhead_ratio_kv; // nhead_q / nhead_kv + int page_size; // if paged, the seqlen-kv per each block + int max_pages_per_seq; +}; + +// this is trait for host API +struct naive_attention_fwd_traits +{ + std::string q_type; + std::string k_type; + std::string v_type; + std::string o_type; + std::string q_layout; + std::string k_layout; + std::string v_layout; + std::string o_layout; + int variation; // sync with naive_attention_variation_enum +}; + +// this is trait for kernel template +template +struct naive_attention_fwd_kernel_traits +{ + static constexpr naive_attention_variation_enum variation = variation_; +}; + +// for simplicity, please do not use const-reference type for the template type +template +struct naive_attention_fwd_kernel +{ + static constexpr bool is_kvcache_i8 = + std::is_same_v && std::is_same_v && sizeof(QType) != 1; + + // kvcache-i8 will have per head scale, we apply this scale to Q/P matrix instead of original + // K/V matrix. 
This can speed up conversion since Q/P usually is fp16/bf16/fp32 + static constexpr bool is_kvcache_i8_forward_quant = is_kvcache_i8; + + // TODO: hardcode + using KVScaleType = float; + using SoftmaxType = float; + using PType = VType; // src A of gemm2, same type as V + + using p_vec_type = ext_vector_t; + static constexpr int p_vec_elem = vector_traits::vector_size; + + __host__ __device__ naive_attention_fwd_kernel() {} + + template + struct addresser + { + int b, s, h, d; // batch, seqlen, nhead, hdim + T* base_ptr; + __device__ addresser(int b_, int s_, int h_, int d_, void* base_ptr_) + : b(b_), s(s_), h(h_), d(d_), base_ptr(reinterpret_cast(base_ptr_)) + { + } + + // TODO: all the batch/nhead offset will accumulate to the base pointer + __device__ T* get_base(int i_b, int i_h) + { + if constexpr(Layout == naive_attention_layout_enum::BSHD) + return base_ptr + i_b * s * h * d + i_h * d; + else if constexpr(Layout == naive_attention_layout_enum::BHSD) + return base_ptr + i_b * s * h * d + i_h * s * d; + } + + __device__ int get_offset(int i_s, int i_d) + { + if constexpr(Layout == naive_attention_layout_enum::BSHD) + return i_s * h * d + i_d; + else if constexpr(Layout == naive_attention_layout_enum::BHSD) + return i_s * d + i_d; + } + + // below set of API will directly use pointer inside this struct + __device__ void init(int i_b, int i_h) { base_ptr = get_base(i_b, i_h); } + __device__ T load(int i_s, int i_d) { return base_ptr[get_offset(i_s, i_d)]; } + __device__ void store(T value, int i_s, int i_d) { base_ptr[get_offset(i_s, i_d)] = value; } + }; + + template + struct page_addresser + { + int s, h, d; // page_size, nhead, hdim + static constexpr int x = 16 / sizeof(T); // pack 4 dword + T* base_ptr; + int* page_table_ptr; // TODO: page table always int + int i_h; // store current head + + __device__ page_addresser(int s_, int h_, int d_, void* base_ptr_, void* pptr_) + : s(s_), + h(h_), + d(d_), + base_ptr(reinterpret_cast(base_ptr_)), + 
page_table_ptr(reinterpret_cast(pptr_)) + { + } + + __device__ int64_t get_phy_page_idx(int i_s) + { + // dynamic compute page idx is simple but slow + int page_idx = i_s / s; + int phy = page_table_ptr[page_idx]; + return static_cast(phy); + } + + __device__ int get_phy_page_offset(int i_s) + { + // dynamic compute page idx is simple but slow + return i_s % s; + } + + __device__ int64_t get_offset(int i_s, int i_d) + { + int page_offset = get_phy_page_offset(i_s); + int64_t page_idx = get_phy_page_idx(i_s); + int64_t base_ = page_idx * h * s * d; + if constexpr(Layout == naive_attention_layout_enum::PHSD) + return static_cast(i_h * s * d + page_offset * d + i_d) + base_; + else if constexpr(Layout == naive_attention_layout_enum::PHDSX) + { + int d_r = i_d / x; + int d_x = i_d % x; + return static_cast(i_h * d * s + d_r * s * x + page_offset * x + d_x) + + base_; + } + else if constexpr(Layout == naive_attention_layout_enum::PHDS) + { + return static_cast(i_h * d * s + i_d * s + page_offset) + base_; + } + } + + // below set of API will directly use pointer inside this struct + __device__ void init(int /*i_b*/, int i_h_) { i_h = i_h_; } + __device__ T load(int i_s, int i_d) { return base_ptr[get_offset(i_s, i_d)]; } + __device__ void store(T /*value*/, int /*i_s*/, int /*i_d*/) {} + }; + + template + struct kvscale_addresser + { + int h, d; // nhead, hdim + T* base_ptr; + __device__ kvscale_addresser(int h_, int d_, void* p_) + : h(h_), d(d_), base_ptr(reinterpret_cast(p_)) + { + } + __device__ int get_offset(int i_h, int i_d, int i_kv /*0 or 1*/) + { + // [h, 2, d] + return i_h * 2 * d + i_kv * d + i_d; + } + __device__ T load(int i_h, int i_d, int i_kv) + { + return base_ptr[get_offset(i_h, i_d, i_kv)]; + } + }; + + __device__ __host__ static constexpr int get_block_size() { return 256; } + + // for simpliciy, 1 WG always compute 1 token along q, compute all token along kv + // compute all hdim from q, compute WG_SIZE hdim from v + // 1) in prefill case, seqlen_q 
>= 1, seqlen_kv >= 1, batch_q=batch_kv + // 2) in decode case, seqlen_q = 1, batch_q is input num-tokens, batch_kv is 1 + // 3) in paged-attn case, we still use 1 WG compute all the seqlen-kv for simplicity + // TODO: could support split-kv to validate intermediate logsum + __host__ static dim3 get_grid_size(naive_attention_fwd_args args) + { + constexpr int wg_size = get_block_size(); + auto g = + dim3((args.hdim_v + wg_size - 1) / wg_size, args.seqlen_q, args.batch_q * args.nhead_q); + return g; + } + + // reduce single pixel within a wave + template + __device__ constexpr T wave_reduce(T local, F reduce_f) + { + // constexpr int wave_size = 64; + constexpr int reduce_stage = 6; // 1<<6=64 + T v_local = local; +#pragma unroll + for(int i_stage = 0; i_stage < reduce_stage; i_stage++) + { + int src_lane = __lane_id() ^ (1 << i_stage); + int32_t v_remote_tmp = + __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast(v_local)); + T v_remote = bit_cast(v_remote_tmp); + v_local = reduce_f(v_local, v_remote); + } + return v_local; + } + + // Note: this function must be called after wave_reduce + // Note: better not use this under if...else... with thread divergence (syncthreads) + template + __device__ constexpr T cross_wave_reduce(T local, F reduce_f, T* smem) + { + constexpr int waves = 4; + constexpr int wave_size = 64; + int lane_id = threadIdx.x % wave_size; + + __syncthreads(); + smem[threadIdx.x] = local; + __syncthreads(); + + // the data within single wave is the same + // but for simplicity, we still use data from each lane. 
+ T v_local = smem[lane_id]; +#pragma unroll + for(int i_stage = 1; i_stage < waves; i_stage++) + { + T v_remote = smem[i_stage * wave_size + lane_id]; + v_local = reduce_f(v_local, v_remote); + } + return v_local; + } + + // kernel entry point + __device__ void operator()(naive_attention_fwd_args args) + { + constexpr int wg_size = get_block_size(); + __shared__ char smem[wg_size * 4 * sizeof(float)]; // should enough + int i_dv = blockIdx.x * wg_size + threadIdx.x; // index of hdim_v + int i_sq = blockIdx.y; // index of seqlen_q + int i_batch = blockIdx.z; // index of batch_q * nhead_q + int i_bq = i_batch / args.nhead_q; // index of batch_q + int i_hq = i_batch % args.nhead_q; // index of nhead_q + + int i_bk = i_bq / args.batch_ratio_kv; + int i_hk = i_hq / args.nhead_ratio_kv; + + void* page_table_ptr = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return reinterpret_cast(args.page_table_ptr) + i_bq * args.max_pages_per_seq; + } + else + { + return nullptr; + } + }(); + + auto q_addr = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED) + { + return addresser{ + args.batch_q, args.seqlen_q, args.nhead_q, args.hdim, args.q_ptr}; + } + else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return addresser{ + args.batch_q, args.seqlen_q, args.nhead_q, args.hdim, args.q_ptr}; + } + }(); + auto k_addr = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED) + { + return addresser{ + args.batch_kv, args.seqlen_kv, args.nhead_kv, args.hdim, args.k_ptr}; + } + else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return page_addresser{ + args.page_size, args.nhead_kv, args.hdim, args.k_ptr, page_table_ptr}; + } + }(); + auto v_addr = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED) + { + return addresser{ + args.batch_kv, 
args.seqlen_kv, args.nhead_kv, args.hdim_v, args.v_ptr}; + } + else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return page_addresser{ + args.page_size, args.nhead_kv, args.hdim_v, args.v_ptr, page_table_ptr}; + } + }(); + auto o_addr = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED) + { + return addresser{ + args.batch_q, args.seqlen_q, args.nhead_q, args.hdim_v, args.o_ptr}; + } + else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return addresser{ + args.batch_q, args.seqlen_q, args.nhead_q, args.hdim_v, args.o_ptr}; + } + }(); + + q_addr.init(i_bq, i_hq); + k_addr.init(i_bk, i_hk); + v_addr.init(i_bk, i_hk); + o_addr.init(i_bq, i_hq); + + auto f_max = [](auto x_, auto y_) { return max(x_, y_); }; + auto f_sum = [](auto x_, auto y_) { return x_ + y_; }; + auto f_absmax_f32 = [](float v_0_, float v_1_) { + float rtn; + asm volatile("v_max_f32 %0, abs(%1), abs(%2)" : "=v"(rtn) : "v"(v_0_), "v"(v_1_)); + return rtn; + }; + + int seqlen_kv = [&]() { + if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED) + { + return args.seqlen_kv; + } + else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED) + { + return reinterpret_cast(args.context_len_ptr)[i_bq]; + } + }(); + + SoftmaxType row_max = -numeric::infinity(); + SoftmaxType l{0}; + AccType o_acc = {0}; + + int sk_loops = (seqlen_kv + wg_size - 1) / wg_size; + float qf_scale = .0f; + kvscale_addresser kvscale_addr{args.nhead_kv, args.hdim, args.kvscale_ptr}; + + if constexpr(is_kvcache_i8_forward_quant) + { + // AccType is i32 now, seqlen_q = 1, hdim up to 256 + float q = 0; + float k_s = 0; + if(static_cast(threadIdx.x) < args.hdim) + { + q = type_convert(q_addr.load(0, threadIdx.x)); + k_s = type_convert(kvscale_addr.load(i_hk, threadIdx.x, 0)); + } + // 1) we apply the k scale to q + float q_forwarded = q * k_s; + + // 2) apply 
smooth-quant + // find absmax + float qf_max = wave_reduce(q_forwarded, f_absmax_f32); + qf_max = cross_wave_reduce(qf_max, f_absmax_f32, reinterpret_cast(smem)); + + // per-token scale + qf_scale = qf_max / 127.0; + + // devide by scale + q = q / qf_scale; + + // fp32->i8 + int8_t quantized_q = static_cast(q); + __syncthreads(); + reinterpret_cast(smem)[threadIdx.x] = quantized_q; + __syncthreads(); + + // after above process, we have 2 data + // 1) int8 q data stored in smem(no need to reload) + // 2) per-token scale qf_scale, to be mul after 1st gemm + } + + for(int i_loop1 = 0; i_loop1 < sk_loops; i_loop1++) + { + int i_sk = i_loop1 * wg_size + threadIdx.x; + // gemm-1 + SoftmaxType s_softmax = -numeric::infinity(); + if(i_sk < seqlen_kv) + { + AccType s_acc{0}; // clear for every loop + for(auto i_dq = 0; i_dq < args.hdim; i_dq++) + { + if constexpr(is_kvcache_i8_forward_quant) + { + int8_t q = reinterpret_cast(smem)[i_dq]; + auto k = k_addr.load(i_sk, i_dq); + + s_acc += type_convert(q) * type_convert(k); + } + else + { + auto q = q_addr.load(i_sq, i_dq); // q will have duplicate load + auto k = k_addr.load(i_sk, i_dq); + + s_acc += type_convert(q) * type_convert(k); + } + } + // scale + s_softmax = type_convert(s_acc); + s_softmax *= + type_convert(args.scale_s * ck_tile::log2e_v); + if constexpr(is_kvcache_i8_forward_quant) + { + s_softmax *= qf_scale; // post scale the per-token factor + } + } + + // s->p + float pf_scale = 0.; // used for i8 quant + { + // softmax, find max + SoftmaxType old_max = row_max; + SoftmaxType cur_max = wave_reduce(s_softmax, f_max); + + cur_max = cross_wave_reduce(cur_max, f_max, reinterpret_cast(smem)); + row_max = max(old_max, cur_max); // update row_max + // softmax, exp(i_elem - max) + SoftmaxType p_compute = __builtin_amdgcn_exp2f(s_softmax - row_max); + + // compute exp_sum + SoftmaxType row_sum = wave_reduce(p_compute, f_sum); + row_sum = cross_wave_reduce(row_sum, f_sum, reinterpret_cast(smem)); + + // l, pre-scall 
o_acc + SoftmaxType tmp = __builtin_amdgcn_exp2f(old_max - row_max); + l = tmp * l + row_sum; + o_acc = type_convert(type_convert(o_acc) * tmp); + + // prepare the p_compute into smem, to let every thread read same p_compute and do + // 2nd gemm + if constexpr(is_kvcache_i8_forward_quant) + { + float v_s = 0; + if(static_cast(threadIdx.x) < args.hdim_v) + { + v_s = type_convert(kvscale_addr.load(i_hk, threadIdx.x, 1)); + } + + // 1) we apply the v scale to p + float p_forwarded = p_compute * v_s; + + // 2) apply smooth-quant + // find absmax + float pf_max = wave_reduce(p_forwarded, f_absmax_f32); + pf_max = + cross_wave_reduce(pf_max, f_absmax_f32, reinterpret_cast(smem)); + + // per-token scale + pf_scale = pf_max / 127.0; + + // devide by scale + p_compute = p_compute / pf_scale; + + // fp32->i8 + int8_t quantized_p = static_cast(p_compute); + __syncthreads(); + reinterpret_cast(smem)[threadIdx.x] = quantized_p; + __syncthreads(); + // after above process, we have 2 data + // 1) int8 p data stored in smem(no need to reload) + // 2) per-token scale pf_scale, to be mul after 2nd gemm + } + else + { + __syncthreads(); + reinterpret_cast(smem)[threadIdx.x] = type_convert(p_compute); + __syncthreads(); + } + } + + // gemm-2, simple loop over vector by vector + constexpr int gemm_2_loop = wg_size / p_vec_elem; + { + AccType o_acc_local = {0}; + int sk_start = i_loop1 * wg_size; // we start from the first seqlen_kv element + for(int i_loop2 = 0; i_loop2 < gemm_2_loop; i_loop2++) + { + p_vec_type p_vec = reinterpret_cast(smem)[i_loop2]; +#pragma unroll + for(int i_j = 0; i_j < p_vec_elem; i_j++) + { + int sv_offset = i_loop2 * p_vec_elem + i_j; + int i_sv = sk_start + sv_offset; + + VType v = 0.f; + if(i_dv < args.hdim_v && i_sv < seqlen_kv) + { + v = v_addr.load(i_sv, i_dv); + } + + o_acc_local += type_convert(p_vec[i_j]) * type_convert(v); + } + } + if constexpr(is_kvcache_i8_forward_quant) + { + // apply pr scale to local acc + o_acc_local = + 
type_convert(type_convert(o_acc_local) * pf_scale); + } + o_acc += o_acc_local; + } + } + + // post scale o_acc + { + SoftmaxType tmp = l == 0.f ? 0.f : 1.f / l; // in case masking + o_acc = type_convert(type_convert(o_acc) * tmp); + } + + // store O + if(i_dv < args.hdim_v) + o_addr.store(type_convert(o_acc), i_sq, i_dv); + } +}; + +#define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_() \ + { \ + using ktraits_ = \ + naive_attention_fwd_kernel_traits( \ + variation_)>; \ + using k_ = naive_attention_fwd_kernel; \ + dim3 grids = k_::get_grid_size(a); \ + r = ck_tile::launch_kernel(s, \ + ck_tile::make_kernel(k_{}, grids, k_::get_block_size(), 0, a)); \ + } + +#define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_() \ + if(t.variation == 0 && t.q_layout == "bshd" && t.k_layout == "bshd" && t.v_layout == "bshd" && \ + t.o_layout == "bshd") \ + { \ + constexpr auto q_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr auto k_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr auto v_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr auto o_layout_ = naive_attention_layout_enum::BSHD; \ + constexpr int variation_ = 0; \ + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \ + } \ + else if(t.variation == 0 && t.q_layout == "bhsd" && t.k_layout == "bhsd" && \ + t.v_layout == "bhsd" && t.o_layout == "bhsd") \ + { \ + constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto k_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto v_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr int variation_ = 0; \ + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \ + } \ + else if(t.variation == 2 && t.q_layout == "bhsd" && t.k_layout == "phdsx" && \ + t.v_layout == "phds" && t.o_layout == "bhsd") \ + { \ + constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr auto k_layout_ = naive_attention_layout_enum::PHDSX; \ + constexpr auto v_layout_ = 
naive_attention_layout_enum::PHDS; \ + constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \ + constexpr int variation_ = 2; \ + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \ + } + +// +CK_TILE_HOST float naive_attention_fwd(naive_attention_fwd_traits t, + naive_attention_fwd_args a, + ck_tile::stream_config s) +{ + float r = -1; + // TODO: do not explicitly create too much instance! + if(t.q_type == "fp16" && t.k_type == "fp16" && t.v_type == "fp16" && t.o_type == "fp16") + { + using q_type_ = fp16_t; + using k_type_ = fp16_t; + using v_type_ = fp16_t; + using o_type_ = fp16_t; + using acc_type_ = float; + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); + } + else if(t.q_type == "bf16" && t.k_type == "bf16" && t.v_type == "bf16" && t.o_type == "bf16") + { + using q_type_ = bf16_t; + using k_type_ = bf16_t; + using v_type_ = bf16_t; + using o_type_ = bf16_t; + using acc_type_ = float; + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); + } + else if(t.q_type == "bf16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "bf16") + { + using q_type_ = bf16_t; + using k_type_ = int8_t; + using v_type_ = int8_t; + using o_type_ = bf16_t; + using acc_type_ = int32_t; // NOTE! + CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); + } + else if(t.q_type == "fp16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "fp16") + { + using q_type_ = fp16_t; + using k_type_ = int8_t; + using v_type_ = int8_t; + using o_type_ = fp16_t; + using acc_type_ = int32_t; // NOTE! 
+ CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_(); + } + return r; +} + +#undef CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_ +#undef CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_ + +} // namespace ck_tile diff --git a/include/ck_tile/remod.py b/include/ck_tile/remod.py index b0d2c36efe..9f2ef3389f 100644 --- a/include/ck_tile/remod.py +++ b/include/ck_tile/remod.py @@ -7,6 +7,7 @@ import copy NS = 'ck_tile' OPS = 'ops' +REF = 'ref' OPS_COMMON = 'common' # common header will be duplicated into ops/* other module HEADER_COMMON = f"""// SPDX-License-Identifier: MIT @@ -29,6 +30,9 @@ class submodule_t: def push(self, f): if len(f.parents) != 1: # ignore ./xxx.hpp mod = get_module(f) + # ref is supposed to include one header on demand + if mod == REF: + return if mod == OPS: if mod not in self.m.keys(): self.m[mod] = dict() From 4e73177684817d425fc583b8827dd09d0c609e94 Mon Sep 17 00:00:00 2001 From: chenjun <46212055+junhaha666@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:53:52 +0800 Subject: [PATCH 52/52] Ck tile/smoothquant out stride (#1742) * add ck_tile/smoothquant out stride parameter * Remove the default stride value --------- Co-authored-by: so --- .../12_smoothquant/example_smoothquant.cpp | 44 +++++++++++-------- .../ck_tile/12_smoothquant/smoothquant.cpp | 44 +++++++++++-------- .../smoothquant/kernel/smoothquant_kernel.hpp | 20 ++++++--- 3 files changed, 66 insertions(+), 42 deletions(-) diff --git a/example/ck_tile/12_smoothquant/example_smoothquant.cpp b/example/ck_tile/12_smoothquant/example_smoothquant.cpp index 3a26eb6a77..aa1d1adfd1 100644 --- a/example/ck_tile/12_smoothquant/example_smoothquant.cpp +++ b/example/ck_tile/12_smoothquant/example_smoothquant.cpp @@ -35,7 +35,8 @@ auto create_args(int argc, char* argv[]) ck_tile::ArgParser arg_parser; arg_parser.insert("m", "3328", "m dimension") .insert("n", "4096", "n dimension") - .insert("stride", "-1", "stride per row, if -1 then equal to n") + .insert("x_stride", "-1", "input stride per row, if -1 then 
equal to n") + .insert("y_stride", "-1", "output stride per row, if -1 then equal to n") .insert("e", "1e-5", "epsilon") .insert("v", "1", "cpu validation or not") .insert("prec", "fp16", "precision") @@ -49,11 +50,14 @@ auto create_args(int argc, char* argv[]) template bool run(const ck_tile::ArgParser& arg_parser) { - ck_tile::index_t m = arg_parser.get_int("m"); - ck_tile::index_t n = arg_parser.get_int("n"); - ck_tile::index_t stride = arg_parser.get_int("stride"); - if(stride < 0) - stride = n; + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t x_stride = arg_parser.get_int("x_stride"); + if(x_stride < 0) + x_stride = n; + ck_tile::index_t y_stride = arg_parser.get_int("y_stride"); + if(y_stride < 0) + y_stride = n; std::string data_type = arg_parser.get_str("prec"); int do_validation = arg_parser.get_int("v"); int warmup = arg_parser.get_int("warmup"); @@ -68,14 +72,14 @@ bool run(const ck_tile::ArgParser& arg_parser) using ComputeDataType = float; // host verify - ck_tile::HostTensor x_host({m, n}, {stride, 1}); + ck_tile::HostTensor x_host({m, n}, {x_stride, 1}); ck_tile::HostTensor xscale_host({n}); ck_tile::HostTensor yscale_host_ref({m}, {1}); ck_tile::HostTensor yscale_host_dev({m}, {1}); - ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); - ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); + ck_tile::HostTensor qy_host_ref({m, n}, {y_stride, 1}); + ck_tile::HostTensor qy_host_dev({m, n}, {y_stride, 1}); ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); ck_tile::FillUniformDistribution{1e-3, .5f}(xscale_host); @@ -116,7 +120,8 @@ bool run(const ck_tile::ArgParser& arg_parser) qy_buf.GetDeviceBuffer(), m, n, - stride}; + x_stride, + y_stride}; auto kargs = Kernel::MakeKargs(args); @@ -133,7 +138,7 @@ bool run(const ck_tile::ArgParser& arg_parser) if(do_validation) { using YDataType = ComputeDataType; - ck_tile::HostTensor y_host({m, n}, {stride, 1}); + ck_tile::HostTensor 
y_host({m, n}, {y_stride, 1}); // smooth outlier { auto f = [&](auto n_) { @@ -183,7 +188,7 @@ bool run(const ck_tile::ArgParser& arg_parser) qy_buf.FromDevice(qy_host_dev.data()); auto [rtol, atol] = get_elimit(); - if(stride == n) + if(y_stride == n) { pass = ck_tile::check_err(qy_host_dev, qy_host_ref, @@ -195,10 +200,12 @@ bool run(const ck_tile::ArgParser& arg_parser) { for(int i_r = 0; i_r < m; i_r++) { - std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * stride, - qy_host_dev.begin() + i_r * stride + n); - std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * stride, - qy_host_ref.begin() + i_r * stride + n); + std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * y_stride, + qy_host_dev.begin() + i_r * y_stride + + n); + std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * y_stride, + qy_host_ref.begin() + i_r * y_stride + + n); pass &= ck_tile::check_err(qy_host_dev_row, qy_host_ref_row, std::string("qy[") + std::to_string(i_r) + @@ -210,8 +217,9 @@ bool run(const ck_tile::ArgParser& arg_parser) } std::cout << "[" << data_type << "]" - << " m:" << m << ", n:" << n << ", stride:" << stride - << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + << " m:" << m << ", n:" << n << ", x_stride:" << x_stride + << ", y_stride:" << y_stride << ", valid:" << (pass ? 
"y" : "n") << std::flush + << std::endl; } return pass; diff --git a/example/ck_tile/12_smoothquant/smoothquant.cpp b/example/ck_tile/12_smoothquant/smoothquant.cpp index ed01d654fd..fd1c4ec7b4 100644 --- a/example/ck_tile/12_smoothquant/smoothquant.cpp +++ b/example/ck_tile/12_smoothquant/smoothquant.cpp @@ -33,7 +33,8 @@ auto create_args(int argc, char* argv[]) ck_tile::ArgParser arg_parser; arg_parser.insert("m", "3328", "m dimension") .insert("n", "4096", "n dimension") - .insert("stride", "-1", "stride per row, if -1 then equal to n") + .insert("x_stride", "-1", "input stride per row, if -1 then equal to n") + .insert("y_stride", "-1", "output stride per row, if -1 then equal to n") .insert("v", "1", "cpu validation or not") .insert("kname", "1", "print kernel name or not") .insert("prec", "fp16", "precision") @@ -47,18 +48,21 @@ auto create_args(int argc, char* argv[]) template bool run(const ck_tile::ArgParser& arg_parser) { - ck_tile::index_t m = arg_parser.get_int("m"); - ck_tile::index_t n = arg_parser.get_int("n"); - ck_tile::index_t stride = arg_parser.get_int("stride"); - if(stride < 0) - stride = n; + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t x_stride = arg_parser.get_int("x_stride"); + if(x_stride < 0) + x_stride = n; + ck_tile::index_t y_stride = arg_parser.get_int("y_stride"); + if(y_stride < 0) + y_stride = n; std::string data_type = arg_parser.get_str("prec"); int kname = arg_parser.get_int("kname"); int do_validation = arg_parser.get_int("v"); int warmup = arg_parser.get_int("warmup"); int repeat = arg_parser.get_int("repeat"); - assert(stride >= n); + assert(x_stride >= n); using TypeConfig = SmoothquantTypeConfig; @@ -69,14 +73,14 @@ bool run(const ck_tile::ArgParser& arg_parser) using ComputeDataType = typename TypeConfig::ComputeDataType; // host verify - ck_tile::HostTensor x_host({m, n}, {stride, 1}); + ck_tile::HostTensor x_host({m, n}, {x_stride, 1}); 
ck_tile::HostTensor xscale_host({n}); ck_tile::HostTensor yscale_host_ref({m}, {1}); ck_tile::HostTensor yscale_host_dev({m}, {1}); - ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); - ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); + ck_tile::HostTensor qy_host_ref({m, n}, {y_stride, 1}); + ck_tile::HostTensor qy_host_dev({m, n}, {y_stride, 1}); ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); ck_tile::FillUniformDistribution{1e-3, .5f}(xscale_host); @@ -90,7 +94,8 @@ bool run(const ck_tile::ArgParser& arg_parser) xscale_buf.ToDevice(xscale_host.data()); std::cout << "[" << data_type << "]" - << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush; + << " m:" << m << ", n:" << n << ", x_stride:" << x_stride << ", y_stride:" << y_stride + << std::flush; smoothquant_traits traits{data_type}; @@ -100,7 +105,8 @@ bool run(const ck_tile::ArgParser& arg_parser) qy_buf.GetDeviceBuffer(), m, n, - stride}; + x_stride, + y_stride}; float ave_time = smoothquant( traits, args, ck_tile::stream_config{nullptr, true, kname ? 
1 : 0, warmup, repeat}); @@ -116,7 +122,7 @@ bool run(const ck_tile::ArgParser& arg_parser) if(do_validation) { using YDataType = ComputeDataType; - ck_tile::HostTensor y_host({m, n}, {stride, 1}); + ck_tile::HostTensor y_host({m, n}, {y_stride, 1}); // smooth outlier { auto f = [&](auto n_) { @@ -166,7 +172,7 @@ bool run(const ck_tile::ArgParser& arg_parser) qy_buf.FromDevice(qy_host_dev.data()); auto [rtol, atol] = get_elimit(); - if(stride == n) + if(y_stride == n) { pass = ck_tile::check_err(qy_host_dev, qy_host_ref, @@ -178,10 +184,12 @@ bool run(const ck_tile::ArgParser& arg_parser) { for(int i_r = 0; i_r < m; i_r++) { - std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * stride, - qy_host_dev.begin() + i_r * stride + n); - std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * stride, - qy_host_ref.begin() + i_r * stride + n); + std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * y_stride, + qy_host_dev.begin() + i_r * y_stride + + n); + std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * y_stride, + qy_host_ref.begin() + i_r * y_stride + + n); pass &= ck_tile::check_err(qy_host_dev_row, qy_host_ref_row, std::string("qy[") + std::to_string(i_r) + diff --git a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp index 6ec3335168..0b3d9d6ca9 100644 --- a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp +++ b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp @@ -19,7 +19,8 @@ struct SmoothquantHostArgs index_t m; index_t n; - index_t stride; // row_stride + index_t x_stride; // input row_stride + index_t y_stride; // output row_stride }; // TODO: Extract some type to wrapper class @@ -58,14 +59,21 @@ struct Smoothquant index_t m; index_t n; - index_t stride; // row_stride + index_t x_stride; // input row_stride + index_t y_stride; // out row_stride }; using Hargs = SmoothquantHostArgs; CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs) { 
- return Kargs{ - hargs.p_x, hargs.p_xscale, hargs.p_yscale, hargs.p_qy, hargs.m, hargs.n, hargs.stride}; + return Kargs{hargs.p_x, + hargs.p_xscale, + hargs.p_yscale, + hargs.p_qy, + hargs.m, + hargs.n, + hargs.x_stride, + hargs.y_stride}; } CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs) @@ -116,7 +124,7 @@ struct Smoothquant const auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_x), make_tuple(kargs.m, kargs.n), - make_tuple(kargs.stride, 1), + make_tuple(kargs.x_stride, 1), number{}, number<1>{}); @@ -157,7 +165,7 @@ struct Smoothquant auto tmp_ = make_naive_tensor_view( static_cast(kargs.p_qy), make_tuple(kargs.m, kargs.n), - make_tuple(kargs.stride, 1), + make_tuple(kargs.y_stride, 1), number{}, number<1>{});