mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
Merge branch 'master' into dev
This commit is contained in:
54
.travis.yml
54
.travis.yml
@@ -1,40 +1,50 @@
|
||||
language: c
|
||||
sudo: required
|
||||
dist: trusty
|
||||
dist: focal
|
||||
matrix:
|
||||
include:
|
||||
# full testsuite (all tests except for mixed datatype)
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto"
|
||||
env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto" \
|
||||
PACKAGES="gcc-8 binutils"
|
||||
# mixed-datatype testsuite (gemm_nn only)
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=MD SDE=0 THR="none" CONF="auto"
|
||||
env: OOT=0 TEST=MD SDE=0 THR="none" CONF="auto" \
|
||||
PACKAGES="gcc-8 binutils"
|
||||
# salt testsuite (fast set of operations+parameters)
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=SALT SDE=0 THR="none" CONF="auto"
|
||||
env: OOT=0 TEST=SALT SDE=0 THR="none" CONF="auto" \
|
||||
PACKAGES="gcc-8 binutils"
|
||||
# test x86_64 ukrs with SDE
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64"
|
||||
env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64" \
|
||||
PACKAGES="gcc-8 binutils"
|
||||
# openmp build
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto"
|
||||
env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto" \
|
||||
PACKAGES="gcc-8 binutils"
|
||||
# pthreads build
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto"
|
||||
env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto" \
|
||||
PACKAGES="gcc-8 binutils"
|
||||
# out-of-tree build
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto"
|
||||
env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto" \
|
||||
PACKAGES="gcc-8 binutils"
|
||||
# clang build
|
||||
- os: linux
|
||||
compiler: clang
|
||||
env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto"
|
||||
# There seems to be some difficulty installing 2 Clang toolchains of different versions.
|
||||
# Use the TravisCI default.
|
||||
# PACKAGES="clang-8 binutils"
|
||||
# macOS with system compiler (clang)
|
||||
- os: osx
|
||||
compiler: clang
|
||||
@@ -43,29 +53,23 @@ matrix:
|
||||
- os: linux
|
||||
compiler: arm-linux-gnueabihf-gcc
|
||||
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa15" \
|
||||
PACKAGES="gcc-arm-linux-gnueabihf qemu-system-arm qemu-user" \
|
||||
PACKAGES="gcc-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-system-arm qemu-user" \
|
||||
TESTSUITE_WRAPPER="qemu-arm -cpu cortex-a15 -L /usr/arm-linux-gnueabihf/"
|
||||
# cortexa57 build and fast testsuite (qemu)
|
||||
- os: linux
|
||||
compiler: aarch64-linux-gnu-gcc
|
||||
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa57" \
|
||||
PACKAGES="gcc-aarch64-linux-gnu qemu-system-arm qemu-user" \
|
||||
PACKAGES="gcc-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
|
||||
TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/"
|
||||
# armsve build and fast testsuite (qemu)
|
||||
- os: linux
|
||||
compiler: aarch64-linux-gnu-gcc-10
|
||||
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="armsve" \
|
||||
PACKAGES="gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
|
||||
TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/"
|
||||
install:
|
||||
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/as; fi
|
||||
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/as /usr/bin/as; fi
|
||||
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/ld; fi
|
||||
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/ld /usr/bin/ld; fi
|
||||
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-6"; fi
|
||||
- if [ -n "$PACKAGES" ]; then sudo apt-get install -y $PACKAGES; fi
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- ubuntu-toolchain-r-test
|
||||
packages:
|
||||
- gcc-6
|
||||
- binutils-2.26
|
||||
- clang
|
||||
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8"; fi
|
||||
- if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi
|
||||
script:
|
||||
- export DIST_PATH=.
|
||||
- pwd
|
||||
@@ -76,5 +80,7 @@ script:
|
||||
- ls -l
|
||||
- $CC --version
|
||||
- make -j 2
|
||||
# Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed on real chip (A64fx).
|
||||
- if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi
|
||||
- if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi
|
||||
- if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi
|
||||
|
||||
1
CREDITS
1
CREDITS
@@ -104,6 +104,7 @@ but many others have contributed code and feedback, including
|
||||
Costas Yamin @cosstas
|
||||
Chenhan Yu @ChenhanYu (The University of Texas at Austin)
|
||||
Roman Yurchak @rth (Symerio)
|
||||
Stefano Zampini @stefanozampini
|
||||
M. Zhou @cdluminate
|
||||
|
||||
BLIS's development was partially funded by grants from industry
|
||||
|
||||
2
Makefile
2
Makefile
@@ -461,7 +461,7 @@ endif
|
||||
|
||||
flat-header: check-env $(BLIS_H_FLAT)
|
||||
|
||||
$(BLIS_H_FLAT): $(FRAME_H99_FILES)
|
||||
$(BLIS_H_FLAT): $(ALL_H99_FILES)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
$(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
|
||||
else
|
||||
|
||||
19
README.md
19
README.md
@@ -13,6 +13,7 @@ Contents
|
||||
* **[Key Features](#key-features)**
|
||||
* **[How to Download BLIS](#how-to-download-blis)**
|
||||
* **[Getting Started](#getting-started)**
|
||||
* **[Performance](#performance)**
|
||||
* **[Documentation](#documentation)**
|
||||
* **[External Packages](#external-packages)**
|
||||
* **[Discussion](#discussion)**
|
||||
@@ -393,6 +394,24 @@ If/when you have time, we *strongly* encourage you to read the detailed
|
||||
walkthrough of the build system found in our [Build System](docs/BuildSystem.md)
|
||||
guide.
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
We provide graphs that report performance of several implementations across a
|
||||
range of hardware types, multithreading configurations, problem sizes,
|
||||
operations, and datatypes. These pages also document most of the details needed
|
||||
to reproduce these experiments.
|
||||
|
||||
* **[Performance](docs/Performance.md).** This document reports empirically
|
||||
measured performance of a representative set of level-3 operations on a variety
|
||||
of hardware architectures, as implemented within BLIS and other BLAS libraries
|
||||
for all four of the standard floating-point datatypes.
|
||||
|
||||
* **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports
|
||||
empirically measured performance of `gemm` on select hardware architectures
|
||||
within BLIS and other BLAS libraries when performing matrix problems where one
|
||||
or two dimensions are exceedingly small.
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
|
||||
@@ -202,12 +202,6 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)"
|
||||
files-that-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),)))
|
||||
files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f))))
|
||||
|
||||
# Define a function that removes duplicate words from a list.
|
||||
# NOTE: This function was obtained via [1]; thanks bobbogo for this
|
||||
# concise definition.
|
||||
# [1] https://stackoverflow.com/questions/16144115/makefile-remove-duplicate-words-without-sorting
|
||||
rm-dupls = $(if $1,$(firstword $1) $(call rm-dupls,$(filter-out $(firstword $1),$1)))
|
||||
|
||||
|
||||
#
|
||||
# --- Include makefile configuration file --------------------------------------
|
||||
|
||||
117
config/a64fx/bli_a64fx_sector_cache.h
Normal file
117
config/a64fx/bli_a64fx_sector_cache.h
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Forschungszentrum Juelich
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// A64FX: set up cache sizes
|
||||
//
|
||||
// Reference: A64FX (TM) specification Fujitsu HPC Extension
|
||||
// Link: https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_Specification_HPC_Extension_v1_EN.pdf
|
||||
//
|
||||
// 63:15 | 14:12 | 11 | 10:08 | 07 | 06:04 | 03 | 02:00 |
|
||||
// RES0 | l1_sec3_max | RES0 | l1_sec2_max | RES0 | l1_sec1_max | RES0 | l1_sec0_max |
|
||||
//
|
||||
// each 3-bit field sets the maximum number of sectors, from 0 to 7:
|
||||
// 000 - 0
|
||||
// 001 - 1
|
||||
// 010 - 2
|
||||
// 011 - 3
|
||||
// 100 - 4
|
||||
// 101 - 5
|
||||
// 110 - 6
|
||||
// 111 - 7
|
||||
//
|
||||
// For L1 we want to maximize the number of sectors for B
|
||||
// Configuration 1: 1 sector for C (sector 3)
|
||||
// 1 sector for A (sector 1)
|
||||
// 6 sectors for B (sector 2)
|
||||
// 0 sectors for the rest (sector 0)
|
||||
//
|
||||
// 16b bitfield conf. 1: 0b0 001 0 110 0 001 0 000
|
||||
//
|
||||
// Configuration 2: 1 sector for C (sector 3)
|
||||
// 1 sector for A (sector 1)
|
||||
// 5 sectors for B (sector 2)
|
||||
// 1 sector for the rest (sector 0)
|
||||
//
|
||||
// 16b bitfield conf. 2: 0b0 001 0 101 0 001 0 001
|
||||
//
|
||||
// accessing the control register:
|
||||
//
|
||||
// MRS <Xt>, S3_3_C11_C8_2
|
||||
// MSR S3_3_C11_C8_2, <Xt>
|
||||
//
|
||||
// TODO: First tests showed no change in performance; a deeper investigation
|
||||
// is necessary
|
||||
// Write config_bitfield (converted to uint64_t) to the sector cache
// configuration register S3_3_C11_C8_2 via MSR.
// The expansion is a complete braced block, so a call site does not need a
// trailing semicolon.
#define A64FX_SETUP_SECTOR_CACHE_SIZES(config_bitfield)\
{\
    __asm__ volatile(\
        "msr s3_3_c11_c8_2,%[sector_cache_config]"\
        :\
        : [sector_cache_config] "r" ((uint64_t)(config_bitfield))\
        :\
    );\
}
|
||||
|
||||
// Write config_bitfield (converted to uint64_t) to the system register
// S3_3_C15_C8_2 via MSR. Judging by the macro name this is the L2 counterpart
// of A64FX_SETUP_SECTOR_CACHE_SIZES -- confirm against the A64FX HPC
// extension specification.
// The expansion is a complete braced block, so a call site does not need a
// trailing semicolon.
#define A64FX_SETUP_SECTOR_CACHE_SIZES_L2(config_bitfield)\
{\
    __asm__ volatile(\
        "msr s3_3_c15_c8_2,%[sector_cache_config]"\
        :\
        : [sector_cache_config] "r" ((uint64_t)(config_bitfield))\
        :\
    );\
}
|
||||
|
||||
|
||||
#define A64FX_SET_CACHE_SECTOR(areg, tag, sparereg)\
|
||||
" mov "#sparereg", "#tag" \n\t"\
|
||||
" lsl "#sparereg", "#sparereg", 56 \n\t"\
|
||||
" orr "#areg", "#areg", "#sparereg" \n\t"
|
||||
|
||||
// Read the sector cache configuration register S3_3_C11_C8_2 (via MRS) into
// output_uint64, which must be a modifiable 64-bit integer lvalue.
//
// The asm operand has a fixed symbolic name rather than one derived from the
// macro argument, so any lvalue expression (array element, struct member,
// dereferenced pointer) is accepted, not only a bare identifier.
//
// The expansion ends with its own semicolon, as it always has, so existing
// call sites written without one keep compiling.
#define A64FX_READ_SECTOR_CACHE_SIZES(output_uint64)\
__asm__ volatile(\
    "mrs %[scc_value],s3_3_c11_c8_2"\
    : [scc_value] "=r" (output_uint64)\
    :\
    :\
    );
|
||||
|
||||
// Build the sector cache configuration bitfield from four per-sector
// maxima. Each argument is truncated to 3 bits (0-7) and placed at bit
// offsets 0, 4, 8 and 12 for sec0, sec1, sec2 and sec3 respectively.
// Every argument is parenthesized and converted to uint64_t before masking,
// so compound expressions such as `a | b` are masked as a whole.
#define A64FX_SCC(sec0,sec1,sec2,sec3)\
    (uint64_t)( ( (uint64_t)(sec0) & 0x7 )         | \
                ( ( (uint64_t)(sec1) & 0x7 ) << 4 )  | \
                ( ( (uint64_t)(sec2) & 0x7 ) << 8 )  | \
                ( ( (uint64_t)(sec3) & 0x7 ) << 12 ) )
|
||||
|
||||
// Build the two-field configuration value passed to
// A64FX_SETUP_SECTOR_CACHE_SIZES_L2. Each argument is truncated to 5 bits
// (0-31); sec02 occupies bits 4:0 and sec13 occupies bits 12:8.
// Both arguments are parenthesized and converted to uint64_t before masking,
// so compound expressions such as `a | b` are masked as a whole.
#define A64FX_SCC_L2(sec02,sec13)\
    (uint64_t)( ( (uint64_t)(sec02) & 0x1F ) | \
                ( ( (uint64_t)(sec13) & 0x1F ) << 8 ) )
|
||||
|
||||
151
config/a64fx/bli_cntx_init_a64fx.c
Normal file
151
config/a64fx/bli_cntx_init_a64fx.c
Normal file
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "bli_a64fx_sector_cache.h"
|
||||
|
||||
// Initialize the context `cntx` for the a64fx configuration.
//
// Steps, in order:
//   1. install the reference defaults via bli_cntx_init_a64fx_ref();
//   2. register the armsve single/double gemm micro-kernels and the
//      double-precision armsve512 packm kernels;
//   3. set the native level-3 register/cache blocksizes;
//   4. unless CACHE_SECTOR_SIZE_READONLY is defined, program the sector
//      cache configuration registers.
// The sup (small/unpacked) setup is kept for reference but compiled out.
void bli_cntx_init_a64fx( cntx_t* cntx )
{
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];
#if 0
    // Only referenced by the disabled sup section below; guarded the same
    // way so that the enabled build has no unused local.
    blksz_t thresh[ BLIS_NUM_THRESH ];
#endif

    // Set default kernel blocksizes and functions.
    bli_cntx_init_a64fx_ref( cntx );

    // -------------------------------------------------------------------------

    // Update the context with optimized native gemm micro-kernels and
    // their storage preferences.
    bli_cntx_set_l3_nat_ukrs
    (
      2,
      BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
      cntx
    );

    // Set SVE-512 packing routine.
    bli_cntx_set_packm_kers
    (
      3,
      BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
      BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
      BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
      cntx
    );

    // Initialize level-3 blocksize objects with architecture-specific values.
    //                                           s      d      c      z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],    32,    16,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    10,    10,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   256,   128,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],  2048,  2048,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880,    -1,    -1 );

    // Update the context with the current architecture's register and cache
    // blocksizes (and multiples) for native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      cntx
    );

#if 0
    // Initialize sup thresholds with architecture-appropriate values.
    //                                          s     d     c     z
    bli_blksz_init_easy( &thresh[ BLIS_MT ],   -1,   65,   -1,   -1 );
    bli_blksz_init_easy( &thresh[ BLIS_NT ],   -1,   65,   -1,   -1 );
    bli_blksz_init_easy( &thresh[ BLIS_KT ],   -1,   65,   -1,   -1 );

    // Initialize the context with the sup thresholds.
    bli_cntx_set_l3_sup_thresh
    (
      3,
      BLIS_MT, &thresh[ BLIS_MT ],
      BLIS_NT, &thresh[ BLIS_NT ],
      BLIS_KT, &thresh[ BLIS_KT ],
      cntx
    );

    // Update the context with optimized small/unpacked gemm kernels.
    bli_cntx_set_l3_sup_kers
    (
      4,
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      cntx
    );

    // Initialize level-3 sup blocksize objects with architecture-specific
    // values.
    //                                           s      d      c      z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,    10,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,    16,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,   120,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,   256,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  4080,    -1,    -1 );

    // Update the context with the current architecture's register and cache
    // blocksizes for small/unpacked level-3 problems.
    bli_cntx_set_l3_sup_blkszs
    (
      5,
      BLIS_NC, &blkszs[ BLIS_NC ],
      BLIS_KC, &blkszs[ BLIS_KC ],
      BLIS_MC, &blkszs[ BLIS_MC ],
      BLIS_NR, &blkszs[ BLIS_NR ],
      BLIS_MR, &blkszs[ BLIS_MR ],
      cntx
    );
#endif

    // Set A64FX cache sector sizes for each PE/CMG
    // SC Fugaku might disable users' setting cache sizes.
    // Define CACHE_SECTOR_SIZE_READONLY to compile these register writes out.
#if !defined(CACHE_SECTOR_SIZE_READONLY)
    // NOTE(review): when built with OpenMP the pragma runs the block once on
    // each thread of the team; presumably that is how every core gets
    // configured -- confirm thread-to-core binding is in effect at this point.
    #pragma omp parallel
    {
        // The setup macros expand to complete braced blocks; no semicolons.
        A64FX_SETUP_SECTOR_CACHE_SIZES(A64FX_SCC(0,1,3,0))
        A64FX_SETUP_SECTOR_CACHE_SIZES_L2(A64FX_SCC_L2(9,28))
    }
#endif

}
|
||||
|
||||
46
config/a64fx/bli_family_a64fx.h
Normal file
46
config/a64fx/bli_family_a64fx.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//#ifndef BLIS_FAMILY_H
|
||||
//#define BLIS_FAMILY_H
|
||||
|
||||
|
||||
// -- MEMORY ALLOCATION --------------------------------------------------------
|
||||
|
||||
#define BLIS_SIMD_ALIGN_SIZE 256
|
||||
#define BLIS_SIMD_NUM_REGISTERS 32
|
||||
|
||||
|
||||
//#endif
|
||||
|
||||
82
config/a64fx/make_defs.mk
Normal file
82
config/a64fx/make_defs.mk
Normal file
@@ -0,0 +1,82 @@
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
# Declare the name of the current configuration and add it to the
|
||||
# running list of configurations included by common.mk.
|
||||
THIS_CONFIG    := a64fx
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS    := -D_GNU_SOURCE -D_A64FX
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O3 -ftree-vectorize -march=armv8-a+sve
endif

# Flags specific to optimized kernels.
CKOPTFLAGS     := $(COPTFLAGS)
# The optimized build already carries the SVE architecture flag through
# COPTFLAGS. A noopt build does not, yet this configuration registers armsve
# kernels, so supply the flag here in that case; otherwise the kernels would
# be built without any SVE architecture flag.
ifeq ($(DEBUG_TYPE),noopt)
CKVECFLAGS     := -march=armv8-a+sve
else
CKVECFLAGS     :=
endif

# Flags specific to reference kernels.
CROPTFLAGS     := $(CKOPTFLAGS)
# gcc and clang take the same extra floating-point flags; any other vendor
# gets only the kernel vector flags.
ifneq ($(filter gcc clang,$(CC_VENDOR)),)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
92
config/armsve/bli_armsve_config_utils.c
Normal file
92
config/armsve/bli_armsve_config_utils.c
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Forschungszentrum Juelich
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
// Query the SVE vector length of the executing core.
//
// NOTE: despite the "bits" in the name, the value returned is a byte count.
// x0 is zeroed and then incremented by INCB, which adds the number of bytes
// in one vector register. EXPANDMAC_BLKSZ_ARMSVE in this file divides the
// result by its S_Data argument (4 or 8) to get its per-vector count.
//
// x0 is used as scratch and is declared as clobbered.
dim_t bli_vl_bits_armsve(void)
{
    uint64_t vl = 0;
    __asm__ (
        " mov x0, xzr \n\t"
        " incb x0 \n\t"
        " mov %[vl], x0 \n\t"
        : [vl] "=r" (vl)
        :
        : "x0"
    );
    return vl;
}
|
||||
|
||||
|
||||
#define EXPANDMAC_BLKSZ_ARMSVE(ch, S_Data) \
|
||||
void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \
|
||||
dim_t *k_c_, dim_t *m_c_, dim_t *n_c_) \
|
||||
{ \
|
||||
dim_t W_L1 = bli_env_get_var("BLIS_SVE_W_L1", W_L1_SVE_DEFAULT); \
|
||||
dim_t N_L1 = bli_env_get_var("BLIS_SVE_N_L1", N_L1_SVE_DEFAULT); \
|
||||
dim_t C_L1 = bli_env_get_var("BLIS_SVE_C_L1", C_L1_SVE_DEFAULT); \
|
||||
dim_t W_L2 = bli_env_get_var("BLIS_SVE_W_L2", W_L2_SVE_DEFAULT); \
|
||||
dim_t N_L2 = bli_env_get_var("BLIS_SVE_N_L2", N_L2_SVE_DEFAULT); \
|
||||
dim_t C_L2 = bli_env_get_var("BLIS_SVE_C_L2", C_L2_SVE_DEFAULT); \
|
||||
dim_t W_L3 = bli_env_get_var("BLIS_SVE_W_L3", W_L3_SVE_DEFAULT); \
|
||||
dim_t N_L3 = bli_env_get_var("BLIS_SVE_N_L3", N_L3_SVE_DEFAULT); \
|
||||
dim_t C_L3 = bli_env_get_var("BLIS_SVE_C_L3", C_L3_SVE_DEFAULT); \
|
||||
\
|
||||
dim_t vl_b = bli_vl_bits_armsve(); \
|
||||
dim_t vl = vl_b / S_Data; \
|
||||
dim_t m_r = 2 * vl; \
|
||||
dim_t n_r = 10; \
|
||||
\
|
||||
dim_t k_c = (dim_t)( floor((W_L1 - 1.0)/(1.0 + (double)n_r/m_r)) * N_L1 * C_L1 ) \
|
||||
/ (n_r * S_Data); \
|
||||
\
|
||||
dim_t C_Ac = W_L2 - 1 - ceil( (2.0 * k_c * n_r * S_Data)/(C_L2 * N_L2) ); \
|
||||
dim_t m_c = C_Ac * (N_L2 * C_L2)/(k_c * S_Data); \
|
||||
m_c -= m_c % m_r; \
|
||||
\
|
||||
dim_t C_Bc = W_L3 - 1 - ceil( (2.0 * k_c * m_c * S_Data)/(C_L3 * N_L3) ); \
|
||||
dim_t n_c = C_Bc * (N_L3 * C_L3)/(k_c * S_Data); \
|
||||
n_c -= n_c % n_r; \
|
||||
\
|
||||
*m_r_ = m_r; \
|
||||
*n_r_ = n_r; \
|
||||
*k_c_ = k_c; \
|
||||
*m_c_ = m_c; \
|
||||
*n_c_ = n_c; \
|
||||
}
|
||||
|
||||
EXPANDMAC_BLKSZ_ARMSVE( s, 4 )
|
||||
EXPANDMAC_BLKSZ_ARMSVE( d, 8 )
|
||||
|
||||
42
config/armsve/bli_armsve_config_utils.h
Normal file
42
config/armsve/bli_armsve_config_utils.h
Normal file
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Forschungszentrum Juelich
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
dim_t bli_vl_bits_armsve(void);
|
||||
|
||||
void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
|
||||
void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
|
||||
|
||||
157
config/armsve/bli_cntx_init_armsve.c
Normal file
157
config/armsve/bli_cntx_init_armsve.c
Normal file
@@ -0,0 +1,157 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "bli_armsve_config_utils.h"
|
||||
|
||||
// Populate the context for the armsve configuration: start from the reference
// defaults, then install the SVE gemm micro-kernels, any matching packm
// kernels, and the blocksizes reported by the configuration helpers.
void bli_cntx_init_armsve( cntx_t* cntx )
{
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];
#if 0
    blksz_t thresh[ BLIS_NUM_THRESH ];
#endif

    // Register and cache blocksizes: _f for single precision, _d for double.
    dim_t mr_f, nr_f, kc_f, mc_f, nc_f;
    dim_t mr_d, nr_d, kc_d, mc_d, nc_d;

    // Set default kernel blocksizes and functions.
    bli_cntx_init_armsve_ref( cntx );

    // -------------------------------------------------------------------------

    // Query the blocksizes for both precisions.
    bli_s_blksz_armsve( &mr_f, &nr_f, &kc_f, &mc_f, &nc_f );
    bli_d_blksz_armsve( &mr_d, &nr_d, &kc_d, &mc_d, &nc_d );

    // Install the optimized native gemm micro-kernels together with their
    // storage preferences. The kernels are vector-length agnostic, yet mr
    // still has to be known at runtime.
    bli_cntx_set_l3_nat_ukrs
    (
      2,
      BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
      cntx
    );

    // Install vector-length-specific packing routines where one exists for
    // the double-precision mr that was reported; otherwise keep the defaults.
    switch ( mr_d )
    {
        case 16:
            bli_cntx_set_packm_kers
            (
              3,
              BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
              BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
              BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
              cntx
            );
            break;

        case 8:
            bli_cntx_set_packm_kers
            (
              1,
              BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_asm_8xk,
              cntx
            );
            break;

        default:
            break;
    }

    // Fill in the level-3 blocksize objects (complex entries are left at -1).
    //                                           s     d     c   z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ], mr_f, mr_d, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ], nr_f, nr_d, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ], mc_f, mc_d, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ], kc_f, kc_d, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ], nc_f, nc_d, -1, -1 );

    // Register the register and cache blocksizes (and their multiples) for
    // native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      cntx
    );

#if 0
    // Disabled: small/unpacked (sup) support.

    // Initialize sup thresholds with architecture-appropriate values.
    //                                          s    d   c   z
    bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 );
    bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 );
    bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 );

    // Initialize the context with the sup thresholds.
    bli_cntx_set_l3_sup_thresh
    (
      3,
      BLIS_MT, &thresh[ BLIS_MT ],
      BLIS_NT, &thresh[ BLIS_NT ],
      BLIS_KT, &thresh[ BLIS_KT ],
      cntx
    );

    // Update the context with optimized small/unpacked gemm kernels.
    bli_cntx_set_l3_sup_kers
    (
      4,
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      cntx
    );

    // Initialize level-3 sup blocksize objects with architecture-specific
    // values. Note that MR takes the native nr and NR takes the native mr.
    //                                          s    d     c   z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, nr_d, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, mr_d, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1,  120, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1,  256, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 );

    // Update the context with the current architecture's register and cache
    // blocksizes for small/unpacked level-3 problems.
    bli_cntx_set_l3_sup_blkszs
    (
      5,
      BLIS_NC, &blkszs[ BLIS_NC ],
      BLIS_KC, &blkszs[ BLIS_KC ],
      BLIS_MC, &blkszs[ BLIS_MC ],
      BLIS_NR, &blkszs[ BLIS_NR ],
      BLIS_MR, &blkszs[ BLIS_MR ],
      cntx
    );
#endif
}
|
||||
|
||||
56
config/armsve/bli_family_armsve.h
Normal file
56
config/armsve/bli_family_armsve.h
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//#ifndef BLIS_FAMILY_H
|
||||
//#define BLIS_FAMILY_H
|
||||
|
||||
|
||||
// -- MEMORY ALLOCATION --------------------------------------------------------
|
||||
|
||||
#define BLIS_SIMD_ALIGN_SIZE 256
|
||||
#define BLIS_SIMD_NUM_REGISTERS 32
|
||||
|
||||
// SVE-specific configs.
// NOTE(review): these look like default cache parameters for the L1, L2 and
// L3 levels, with N_ = number of sets, W_ = associativity (ways) and
// C_ = line size in bytes -- inferred from the names only; confirm against the
// code that consumes these macros.
|
||||
#define N_L1_SVE_DEFAULT 64
|
||||
#define W_L1_SVE_DEFAULT 4
|
||||
#define C_L1_SVE_DEFAULT 256
|
||||
#define N_L2_SVE_DEFAULT 2048
|
||||
#define W_L2_SVE_DEFAULT 16
|
||||
#define C_L2_SVE_DEFAULT 256
|
||||
#define N_L3_SVE_DEFAULT 8192
|
||||
#define W_L3_SVE_DEFAULT 16
|
||||
#define C_L3_SVE_DEFAULT 256
|
||||
|
||||
//#endif
|
||||
|
||||
82
config/armsve/make_defs.mk
Normal file
82
config/armsve/make_defs.mk
Normal file
@@ -0,0 +1,82 @@
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
# Name the configuration being defined. (Adding it to the running list of
# configurations included by common.mk is currently disabled.)
THIS_CONFIG := armsve
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: common.mk appends various general-purpose, configuration-agnostic
# flags to these variables; only configuration-specific additions belong here.
CPPROCFLAGS := -D_GNU_SOURCE
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=

# Debug symbols are requested for every debug type except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Optimize and target SVE unless an unoptimized ("noopt") build was requested.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O3 -ftree-vectorize -march=armv8-a+sve
else
COPTFLAGS := -O0
endif

# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
CKVECFLAGS :=

# Flags specific to reference kernels. gcc and clang additionally get the
# relaxed floating-point options; any other vendor gets the kernel flags as-is.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif

# Copy every variable set above into a variable whose name carries the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
@@ -32,6 +32,8 @@ piledriver: piledriver
|
||||
bulldozer: bulldozer
|
||||
|
||||
# ARM architectures.
|
||||
armsve: armsve/armsve
|
||||
a64fx: a64fx/armsve
|
||||
thunderx2: thunderx2/armv8a
|
||||
cortexa57: cortexa57/armv8a
|
||||
cortexa53: cortexa53/armv8a
|
||||
|
||||
5
configure
vendored
5
configure
vendored
@@ -2373,6 +2373,11 @@ main()
|
||||
fi
|
||||
|
||||
echo "${script_name}: using '${found_cc}' C compiler."
|
||||
|
||||
# Also check the compiler to see if we are (cross-)compiling for Windows
|
||||
if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then
|
||||
is_win=yes
|
||||
fi
|
||||
|
||||
|
||||
# -- Find a C++ compiler ---------------------------------------------------
|
||||
|
||||
@@ -154,7 +154,7 @@ Originally, BLIS did indeed require the application to explicitly setup (initial
|
||||
|
||||
Yes! BLIS supports multithreading (via OpenMP or POSIX threads) for all of its level-3 operations. For more information on enabling and controlling multithreading, please see the [Multithreading](Multithreading.md) guide.
|
||||
|
||||
BLIS is also thread-safe so that you can call BLIS from threads within a multithreaded library or application. BLIS derives is thread-safety via unconditional use of features present in POSIX threads (pthreads). These pthreads features are employed for thread-safety regardless of whether BLIS is configured for OpenMP multithreading, pthreads multithreading, or single-threaded execution.
|
||||
BLIS is also thread-safe so that you can call BLIS from threads within a multithreaded library or application. BLIS derives its thread-safety via unconditional use of features present in POSIX threads (pthreads). These pthreads features are employed for thread-safety regardless of whether BLIS is configured for OpenMP multithreading, pthreads multithreading, or single-threaded execution.
|
||||
|
||||
### Does BLIS support NUMA environments?
|
||||
|
||||
|
||||
@@ -21,6 +21,9 @@
|
||||
* **[Zen2](Performance.md#zen2)**
|
||||
* **[Experiment details](Performance.md#zen2-experiment-details)**
|
||||
* **[Results](Performance.md#zen2-results)**
|
||||
* **[A64fx](Performance.md#a64fx)**
|
||||
* **[Experiment details](Performance.md#a64fx-experiment-details)**
|
||||
* **[Results](Performance.md#a64fx-results)**
|
||||
* **[Feedback](Performance.md#feedback)**
|
||||
|
||||
# Introduction
|
||||
@@ -526,6 +529,78 @@ The `runthese.m` file will contain example invocations of the function.
|
||||
|
||||
---
|
||||
|
||||
## A64fx
|
||||
|
||||
### A64fx experiment details
|
||||
|
||||
* Location: RIKEN Center for Computational Science in Kobe, Japan
|
||||
* These test results were gathered on the Fugaku supercomputer under project "量子物質の創発と機能のための基礎科学 ―「富岳」と最先端実験の密連携による革新的強相関電子科学" (hp200132) (Basic Science for Emergence and Functionality in Quantum Matter: Innovative Strongly-Correlated Electron Science by Integration of "Fugaku" and Frontier Experiments)
|
||||
* Processor model: Fujitsu A64fx
|
||||
* Core topology: one socket, 4 NUMA groups per socket, 13 cores per group (one reserved for the OS), 48 cores total
|
||||
* SMT status: Unknown
|
||||
* Max clock rate: 2.2GHz (single- and multicore, observed)
|
||||
* Max vector register length: 512 bits (SVE)
|
||||
* Max FMA vector IPC: 2
|
||||
* Peak performance:
|
||||
* single-core: 70.4 GFLOPS (double-precision), 140.8 GFLOPS (single-precision)
|
||||
* multicore: 70.4 GFLOPS/core (double-precision), 140.8 GFLOPS/core (single-precision)
|
||||
* Operating system: RHEL 8.3
|
||||
* Page size: 256 bytes
|
||||
* Compiler: gcc 10.1.0
|
||||
* Results gathered: 2 April 2021; BLIS and SSL2 updated on 20 May 2021
|
||||
* Implementations tested:
|
||||
* BLIS 61584de (post-0.8.1)
|
||||
* configured with:
|
||||
* `../configure -t none CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (single-threaded)
|
||||
* `../configure -t openmp CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (multithreaded)
|
||||
* sub-configuration exercised: `a64fx`
|
||||
* Single-threaded (1 core) execution requested via no change in environment variables
|
||||
* Multithreaded (12 core) execution requested via `export BLIS_JC_NT=1 BLIS_IC_NT=1 BLIS_JR_NT=12`
|
||||
* Multithreaded (48 core) execution requested via `export BLIS_JC_NT=1 BLIS_IC_NT=4 BLIS_JR_NT=12`
|
||||
* Eigen 3.3.9
|
||||
* Obtained via the [Eigen GitLab homepage](https://gitlab.com/libeigen/eigen)
|
||||
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
|
||||
* installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install`
|
||||
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
|
||||
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
|
||||
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12`
|
||||
* Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48`
|
||||
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
|
||||
* ARMPL (20.1.0 for A64fx)
|
||||
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
|
||||
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12`
|
||||
* Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48`
|
||||
* **NOTE**: While this version of ARMPL does provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, and `trsm` (with the exception of `dtrsm`), these implementations yield very low performance, and their long run times led us to skip collecting these data altogether.
|
||||
* Fujitsu SSL2 (Fujitsu toolchain 1.2.31)
|
||||
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1 NPARALLEL=1`
|
||||
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12 NPARALLEL=12`
|
||||
* Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48 NPARALLEL=48`
|
||||
* Affinity:
|
||||
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="12-23 24-35 36-47 48-59"`.
|
||||
* All executables were run through `numactl --interleave=all` (multithreaded only).
|
||||
* Frequency throttling: No change made. No frequency lowering observed.
|
||||
* Comments:
|
||||
* Special thanks to Stepan Nassyr and RuQing G. Xu for their work in developing and optimizing A64fx support. Also, thanks to RuQing G. Xu for collecting the data that appear in these graphs.
|
||||
|
||||
### A64fx results
|
||||
|
||||
#### pdf
|
||||
|
||||
* [A64fx single-threaded](graphs/large/l3_perf_a64fx_nt1.pdf)
|
||||
* [A64fx multithreaded (12 cores)](graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf)
|
||||
* [A64fx multithreaded (48 cores)](graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf)
|
||||
|
||||
#### png (inline)
|
||||
|
||||
* **A64fx single-threaded**
|
||||

|
||||
* **A64fx multithreaded (12 cores)**
|
||||

|
||||
* **A64fx multithreaded (48 cores)**
|
||||

|
||||
|
||||
---
|
||||
|
||||
# Feedback
|
||||
|
||||
Please let us know what you think of these performance results! Similarly, if you have any questions or concerns, or are interested in reproducing these performance experiments on your own hardware, we invite you to [open an issue](https://github.com/flame/blis/issues) and start a conversation with BLIS developers.
|
||||
|
||||
BIN
docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf
Normal file
BIN
docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf
Normal file
Binary file not shown.
BIN
docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png
Normal file
BIN
docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 250 KiB |
BIN
docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf
Normal file
BIN
docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf
Normal file
Binary file not shown.
BIN
docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png
Normal file
BIN
docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 260 KiB |
BIN
docs/graphs/large/l3_perf_a64fx_nt1.pdf
Normal file
BIN
docs/graphs/large/l3_perf_a64fx_nt1.pdf
Normal file
Binary file not shown.
BIN
docs/graphs/large/l3_perf_a64fx_nt1.png
Normal file
BIN
docs/graphs/large/l3_perf_a64fx_nt1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 250 KiB |
@@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \
|
||||
} \
|
||||
else /* if ( will_pack == TRUE ) */ \
|
||||
{ \
|
||||
/* NOTE: This is "rounding up" of the last upanel is actually optional
|
||||
/* NOTE: This "rounding up" of the last upanel is actually optional
|
||||
for the rrc/crc cases, but absolutely necessary for the other cases
|
||||
since we NEED that last micropanel to have the same ldim (cs_p) as
|
||||
the other micropanels. Why? So that millikernels can use the same
|
||||
|
||||
@@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \
|
||||
} \
|
||||
else /* if ( will_pack == TRUE ) */ \
|
||||
{ \
|
||||
/* NOTE: This is "rounding up" of the last upanel is actually optional
|
||||
/* NOTE: This "rounding up" of the last upanel is actually optional
|
||||
for the rrc/crc cases, but absolutely necessary for the other cases
|
||||
since we NEED that last micropanel to have the same ldim (cs_p) as
|
||||
the other micropanels. Why? So that millikernels can use the same
|
||||
@@ -280,15 +280,15 @@ void PASTEMAC(ch,opname) \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* All other stor3_t ids: pack A to column-stored row-panels. */ \
|
||||
/* All other stor3_t ids: pack B to row-stored column-panels. */ \
|
||||
*rs_p = nr; \
|
||||
*cs_p = 1; \
|
||||
\
|
||||
*pd_p = nr; \
|
||||
*ps_p = k * nr; \
|
||||
\
|
||||
/* Set the schema to "packed row panels" to indicate packing to
|
||||
conventional column-stored row panels. */ \
|
||||
/* Set the schema to "packed column panels" to indicate packing to
|
||||
conventional row-stored column panels. */ \
|
||||
*schema = BLIS_PACKED_COL_PANELS; \
|
||||
} \
|
||||
\
|
||||
|
||||
@@ -173,6 +173,12 @@ void bli_arch_set_id( void )
|
||||
#endif
|
||||
|
||||
// ARM microarchitectures.
|
||||
#ifdef BLIS_FAMILY_ARMSVE
|
||||
id = BLIS_ARCH_ARMSVE;
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_A64FX
|
||||
id = BLIS_ARCH_A64FX;
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_THUNDERX2
|
||||
id = BLIS_ARCH_THUNDERX2;
|
||||
#endif
|
||||
@@ -242,6 +248,8 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
|
||||
"thunderx2",
|
||||
"cortexa57",
|
||||
"cortexa53",
|
||||
"armsve",
|
||||
"a64fx",
|
||||
"cortexa15",
|
||||
"cortexa9",
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ arch_t bli_cpuid_query_id( void )
|
||||
printf( "vendor = %s\n", vendor==1 ? "AMD": "INTEL" );
|
||||
printf("family = %x\n", family );
|
||||
printf( "model = %x\n", model );
|
||||
|
||||
|
||||
printf( "features = %x\n", features );
|
||||
#endif
|
||||
|
||||
@@ -455,6 +455,14 @@ arch_t bli_cpuid_query_id( void )
|
||||
{
|
||||
// For each ARMv8 configuration that is enabled, check for that
|
||||
// microarchitecture. We check from most recent to most dated.
|
||||
#ifdef BLIS_CONFIG_ARMSVE
|
||||
if ( bli_cpuid_is_armsve( model, part, features ) )
|
||||
return BLIS_ARCH_ARMSVE;
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_A64FX
|
||||
if ( bli_cpuid_is_a64fx( model, part, features ) )
|
||||
return BLIS_ARCH_A64FX;
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_THUNDERX2
|
||||
if ( bli_cpuid_is_thunderx2( model, part, features ) )
|
||||
return BLIS_ARCH_THUNDERX2;
|
||||
@@ -537,6 +545,36 @@ bool bli_cpuid_is_cortexa53
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_armsve
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
uint32_t features
|
||||
)
|
||||
{
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_SVE;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_a64fx
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
uint32_t features
|
||||
)
|
||||
{
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_SVE;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_cortexa15
|
||||
(
|
||||
uint32_t family,
|
||||
@@ -1032,6 +1070,10 @@ uint32_t bli_cpuid_query
|
||||
strstr( feat_str, "asimd" ) != NULL )
|
||||
*features |= FEATURE_NEON;
|
||||
|
||||
// Parse the feature string to check for SVE features.
|
||||
if ( strstr( feat_str, "sve" ) != NULL )
|
||||
*features |= FEATURE_SVE;
|
||||
|
||||
//printf( "bli_cpuid_query(): features var: %u\n", *features );
|
||||
|
||||
// Parse the processor string to uncover the model.
|
||||
|
||||
@@ -72,6 +72,8 @@ bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features
|
||||
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );
|
||||
|
||||
@@ -175,7 +177,8 @@ enum
|
||||
};
|
||||
enum
|
||||
{
|
||||
FEATURE_NEON = 0x1
|
||||
FEATURE_NEON = 0x01,
|
||||
FEATURE_SVE = 0x02
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -144,6 +144,16 @@ void bli_gks_init( void )
|
||||
bli_cntx_init_cortexa53_ref,
|
||||
bli_cntx_init_cortexa53_ind );
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_ARMSVE
|
||||
bli_gks_register_cntx( BLIS_ARCH_ARMSVE, bli_cntx_init_armsve,
|
||||
bli_cntx_init_armsve_ref,
|
||||
bli_cntx_init_armsve_ind );
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_A64FX
|
||||
bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx,
|
||||
bli_cntx_init_a64fx_ref,
|
||||
bli_cntx_init_a64fx_ind );
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_CORTEXA15
|
||||
bli_gks_register_cntx( BLIS_ARCH_CORTEXA15, bli_cntx_init_cortexa15,
|
||||
bli_cntx_init_cortexa15_ref,
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
/* dlamch.f -- translated by f2c (version 19991025).
|
||||
You must link the resulting object file with the libraries:
|
||||
-lf2c -lm (in that order)
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
#include <float.h>
|
||||
#include <fenv.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_LEGACY_LAMCH
|
||||
|
||||
double bli_pow_di( bla_double* a, bla_integer* n );
|
||||
|
||||
@@ -1027,6 +1029,59 @@ L10:
|
||||
|
||||
} /* bli_dlamc5_ */
|
||||
|
||||
#ifdef __cplusplus
|
||||
#else
|
||||
|
||||
/* Double-precision machine-parameter query built on the <float.h> constants.
   Only the first character of cmach is examined, case-insensitively;
   cmach_len is not used. Returns 0.0 for an unrecognized selector. */
bla_double bli_dlamch(bla_character *cmach, ftnlen cmach_len)
{
    /* = 'E' or 'e', DLAMCH := eps */
    /* = 'S' or 's', DLAMCH := sfmin */
    /* = 'B' or 'b', DLAMCH := base */
    /* = 'P' or 'p', DLAMCH := eps*base */
    /* = 'N' or 'n', DLAMCH := t */
    /* = 'R' or 'r', DLAMCH := rnd */
    /* = 'M' or 'm', DLAMCH := emin */
    /* = 'U' or 'u', DLAMCH := rmin */
    /* = 'L' or 'l', DLAMCH := emax */
    /* = 'O' or 'o', DLAMCH := rmax */

    /* where */

    /* eps = relative machine precision */
    /* sfmin = safe minimum, such that 1/sfmin does not overflow */
    /* base = base of the machine */
    /* prec = eps*base */
    /* t = number of (base) digits in the mantissa */
    /* rnd = 1.0 when rounding occurs in addition, 0.0 otherwise */
    /* emin = minimum exponent before (gradual) underflow */
    /* rmin = underflow threshold - base**(emin-1) */
    /* emax = largest exponent before overflow */
    /* rmax = overflow threshold - (base**emax)*(1-eps) */

    double safe_min = DBL_MIN;
    double small = 1.0 / DBL_MAX;

    /* If 1/DBL_MAX is not below DBL_MIN, nudge the safe minimum upward so
       that its reciprocal cannot overflow. */
    if ( small >= safe_min )
        safe_min = small * ( 1.0 + DBL_EPSILON );

    /* toupper() is only defined for values representable as unsigned char
       (or EOF), so convert the possibly-signed char first. */
    switch ( toupper( ( unsigned char )*cmach ) )
    {
        case 'E': return DBL_EPSILON;
        case 'S': return safe_min;
        case 'B': return FLT_RADIX;
        case 'P': return FLT_RADIX*DBL_EPSILON;
        case 'N': return DBL_MANT_DIG;
        /* FLT_ROUNDS uses its own encoding (1 means round-to-nearest); it
           must not be compared with the <fenv.h> FE_* macros, whose values
           are implementation-defined. */
        case 'R': return FLT_ROUNDS == 1 ? 1.0 : 0.0;
        case 'M': return DBL_MIN_EXP;
        case 'U': return DBL_MIN;
        case 'L': return DBL_MAX_EXP;
        case 'O': return DBL_MAX;
    }

    return 0.0;
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
/* slamch.f -- translated by f2c (version 19991025).
|
||||
You must link the resulting object file with the libraries:
|
||||
-lf2c -lm (in that order)
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
#include <float.h>
|
||||
#include <fenv.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_LEGACY_LAMCH
|
||||
|
||||
double bli_pow_ri( bla_real* a, bla_integer* n );
|
||||
|
||||
@@ -1022,6 +1024,59 @@ L10:
|
||||
|
||||
} /* bli_slamc5_ */
|
||||
|
||||
#ifdef __cplusplus
|
||||
#else
|
||||
|
||||
/* Single-precision machine-parameter query built on the <float.h> constants.
   Only the first character of cmach is examined, case-insensitively;
   cmach_len is not used. Returns 0.0f for an unrecognized selector. */
bla_real bli_slamch(bla_character *cmach, ftnlen cmach_len)
{
    /* = 'E' or 'e', SLAMCH := eps */
    /* = 'S' or 's', SLAMCH := sfmin */
    /* = 'B' or 'b', SLAMCH := base */
    /* = 'P' or 'p', SLAMCH := eps*base */
    /* = 'N' or 'n', SLAMCH := t */
    /* = 'R' or 'r', SLAMCH := rnd */
    /* = 'M' or 'm', SLAMCH := emin */
    /* = 'U' or 'u', SLAMCH := rmin */
    /* = 'L' or 'l', SLAMCH := emax */
    /* = 'O' or 'o', SLAMCH := rmax */

    /* where */

    /* eps = relative machine precision */
    /* sfmin = safe minimum, such that 1/sfmin does not overflow */
    /* base = base of the machine */
    /* prec = eps*base */
    /* t = number of (base) digits in the mantissa */
    /* rnd = 1.0 when rounding occurs in addition, 0.0 otherwise */
    /* emin = minimum exponent before (gradual) underflow */
    /* rmin = underflow threshold - base**(emin-1) */
    /* emax = largest exponent before overflow */
    /* rmax = overflow threshold - (base**emax)*(1-eps) */

    float safe_min = FLT_MIN;
    float small = 1.0f / FLT_MAX;

    /* If 1/FLT_MAX is not below FLT_MIN, nudge the safe minimum upward so
       that its reciprocal cannot overflow. */
    if ( small >= safe_min )
        safe_min = small * ( 1.0f + FLT_EPSILON );

    /* toupper() is only defined for values representable as unsigned char
       (or EOF), so convert the possibly-signed char first. */
    switch ( toupper( ( unsigned char )*cmach ) )
    {
        case 'E': return FLT_EPSILON;
        case 'S': return safe_min;
        case 'B': return FLT_RADIX;
        case 'P': return FLT_RADIX*FLT_EPSILON;
        case 'N': return FLT_MANT_DIG;
        /* FLT_ROUNDS uses its own encoding (1 means round-to-nearest); it
           must not be compared with the <fenv.h> FE_* macros, whose values
           are implementation-defined. */
        case 'R': return FLT_ROUNDS == 1 ? 1.0f : 0.0f;
        case 'M': return FLT_MIN_EXP;
        case 'U': return FLT_MIN;
        case 'L': return FLT_MAX_EXP;
        case 'O': return FLT_MAX;
    }

    return 0.0f;
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -83,6 +83,12 @@ CNTX_INIT_PROTS( bulldozer )
|
||||
|
||||
// -- ARM architectures --
|
||||
|
||||
#ifdef BLIS_CONFIG_ARMSVE
|
||||
CNTX_INIT_PROTS( armsve )
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_A64FX
|
||||
CNTX_INIT_PROTS( a64fx )
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_THUNDERX2
|
||||
CNTX_INIT_PROTS( thunderx2 )
|
||||
#endif
|
||||
@@ -183,6 +189,12 @@ CNTX_INIT_PROTS( generic )
|
||||
|
||||
// -- ARM architectures --
|
||||
|
||||
#ifdef BLIS_FAMILY_ARMSVE
|
||||
#include "bli_family_armsve.h"
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_A64FX
|
||||
#include "bli_family_a64fx.h"
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_THUNDERX2
|
||||
#include "bli_family_thunderx2.h"
|
||||
#endif
|
||||
|
||||
@@ -128,6 +128,20 @@ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
|
||||
|
||||
|
||||
|
||||
// -- One-operand macro (with custom prefix) --
|
||||
|
||||
// GENARRAY_PREF expands to the declarator-plus-initializer
//   arrayname[BLIS_NUM_FP_TYPES] = { ... }
// with exactly four initializers, PASTECH2(prefix,<ch>,op) for ch = s, c, d, z,
// listed in that order. The macro emits no type or storage class, so the
// invocation site is expected to supply them in front of the macro.
// NOTE(review): the s, c, d, z order presumably mirrors the numeric order of
// the floating-point datatype enumeration used to index the array -- confirm.
#define GENARRAY_PREF(arrayname,prefix,op) \
|
||||
\
|
||||
arrayname[BLIS_NUM_FP_TYPES] = \
|
||||
{ \
|
||||
PASTECH2(prefix,s,op), \
|
||||
PASTECH2(prefix,c,op), \
|
||||
PASTECH2(prefix,d,op), \
|
||||
PASTECH2(prefix,z,op) \
|
||||
}
|
||||
|
||||
|
||||
|
||||
// -- Two-operand macros --
|
||||
|
||||
|
||||
|
||||
@@ -1190,7 +1190,7 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b )
|
||||
// -- Initialization-related macros --
|
||||
|
||||
// Finish the initialization started by the matrix-specific static initializer
|
||||
// (e.g. BLIS_OBJECT_PREINITIALIZER)
|
||||
// (e.g. BLIS_OBJECT_INITIALIZER)
|
||||
// NOTE: This is intended only for use in the BLAS compatibility API and typed
|
||||
// BLIS API.
|
||||
|
||||
@@ -1223,7 +1223,7 @@ BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t
|
||||
}
|
||||
|
||||
// Finish the initialization started by the 1x1-specific static initializer
|
||||
// (e.g. BLIS_OBJECT_PREINITIALIZER_1X1)
|
||||
// (e.g. BLIS_OBJECT_INITIALIZER_1X1)
|
||||
// NOTE: This is intended only for use in the BLAS compatibility API and typed
|
||||
// BLIS API.
|
||||
|
||||
|
||||
@@ -1008,6 +1008,8 @@ typedef enum
|
||||
BLIS_ARCH_BULLDOZER,
|
||||
|
||||
// ARM
|
||||
BLIS_ARCH_ARMSVE,
|
||||
BLIS_ARCH_A64FX,
|
||||
BLIS_ARCH_THUNDERX2,
|
||||
BLIS_ARCH_CORTEXA57,
|
||||
BLIS_ARCH_CORTEXA53,
|
||||
@@ -1032,7 +1034,7 @@ typedef enum
|
||||
|
||||
// NOTE: This value must be updated to reflect the number of enum values
|
||||
// listed above for arch_t!
|
||||
#define BLIS_NUM_ARCHS 22
|
||||
//#define BLIS_NUM_ARCHS 25
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -885,6 +885,8 @@
|
||||
#define VADDSUBPD(_0, _1, _2) INSTR_(vaddsubpd, _0, _1, _2)
|
||||
#define VHADDPD(_0, _1, _2) INSTR_(vhaddpd, _0, _1, _2)
|
||||
#define VHADDPS(_0, _1, _2) INSTR_(vhaddps, _0, _1, _2)
|
||||
#define VHSUBPD(_0, _1, _2) INSTR_(vhsubpd, _0, _1, _2)
|
||||
#define VHSUBPS(_0, _1, _2) INSTR_(vhsubps, _0, _1, _2)
|
||||
#define VADDPS(_0, _1, _2) INSTR_(vaddps, _0, _1, _2)
|
||||
#define VADDPD(_0, _1, _2) INSTR_(vaddpd, _0, _1, _2)
|
||||
#define VSUBPS(_0, _1, _2) INSTR_(vsubps, _0, _1, _2)
|
||||
@@ -1015,6 +1017,8 @@
|
||||
#define vaddsubpd(_0, _1, _2) VADDSUBPD(_0, _1, _2)
|
||||
#define vhaddpd(_0, _1, _2) VHADDPD(_0, _1, _2)
|
||||
#define vhaddps(_0, _1, _2) VHADDPS(_0, _1, _2)
|
||||
#define vhsubpd(_0, _1, _2) VHSUBPD(_0, _1, _2)
|
||||
#define vhsubps(_0, _1, _2) VHSUBPS(_0, _1, _2)
|
||||
#define vaddps(_0, _1, _2) VADDPS(_0, _1, _2)
|
||||
#define vaddpd(_0, _1, _2) VADDPD(_0, _1, _2)
|
||||
#define vsubps(_0, _1, _2) VSUBPS(_0, _1, _2)
|
||||
|
||||
45
kernels/armsve/1m/armsve512_asm_transpose_d8x2.h
Normal file
45
kernels/armsve/1m/armsve512_asm_transpose_d8x2.h
Normal file
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
 * SVE512_IN_REG_TRANSPOSE_d8x2 -- emit assembly text (adjacent string literals,
 * to be placed inside an inline-asm statement) that transposes a 2x8 block of
 * doubles held in two vector registers.
 *
 *   DST6SRC0, DST7SRC1  in : the two source rows.
 *                       out: overwritten with the last two results.
 *   DST0 .. DST5        out: the first six results.
 *   P2C, P4C, P6C       predicates handed to "compact".
 *   PT                  accepted for signature symmetry; not referenced in
 *                       the expansion.
 *
 * trn1/trn2 interleave the even- / odd-indexed 64-bit elements of the two
 * sources into DST0 / DST1. Each "compact" then packs the predicate-selected
 * elements of DST0 or DST1 into the lowest lanes of its destination.
 * NOTE(review): assumes a 512-bit vector length and that P2C/P4C/P6C select
 * lanes 2..7, 4..7 and 6..7 (as SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE builds
 * them); under that assumption the low two lanes of DSTk hold column k of the
 * 2x8 block -- confirm against the caller.
 * DST6SRC0/DST7SRC1 are written last because they are still needed as sources
 * by the leading trn1/trn2 pair.
 */
#define SVE512_IN_REG_TRANSPOSE_d8x2(DST0,DST1,DST2,DST3,DST4,DST5,DST6SRC0,DST7SRC1,PT,P2C,P4C,P6C) \
|
||||
"trn1 " #DST0".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
|
||||
"trn2 " #DST1".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
|
||||
"compact " #DST2".d, " #P2C", " #DST0".d \n\t" \
|
||||
"compact " #DST3".d, " #P2C", " #DST1".d \n\t" \
|
||||
"compact " #DST4".d, " #P4C", " #DST0".d \n\t" \
|
||||
"compact " #DST5".d, " #P4C", " #DST1".d \n\t" \
|
||||
"compact " #DST6SRC0".d, " #P6C", " #DST0".d \n\t" \
|
||||
"compact " #DST7SRC1".d, " #P6C", " #DST1".d \n\t"
|
||||
|
||||
97
kernels/armsve/1m/armsve512_asm_transpose_d8x8.h
Normal file
97
kernels/armsve/1m/armsve512_asm_transpose_d8x8.h
Normal file
@@ -0,0 +1,97 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
 * SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE -- emit assembly text that builds the
 * predicate registers consumed by the in-register transpose macros.
 *
 *   XTMP   general-purpose scratch register; overwritten with 2, 4 and 6.
 *   PT     set to all-true for 64-bit elements (ptrue .d).
 *   P4     lanes 0..3 active (whilelo 0 < 4).
 *   P6     lanes 0..5 active (whilelo 0 < 6).
 *   PTFTF  (P6 xor P4) or (lanes 0..1), i.e. lanes 0,1,4,5 active.
 *   P2C    first built as lanes 0..1, then complemented under PT.
 *   P4C    complement of P4 under PT.
 *   P6C    complement of P6 under PT.
 *
 * P2C is complemented only after PTFTF has been formed because the "orr"
 * still needs the un-complemented lanes-0..1 value.
 * NOTE(review): with a 512-bit vector length (8 doubles) the complemented
 * predicates select lanes 2..7, 4..7 and 6..7 -- confirm the VL assumption.
 * NOTE(review): "whilelo" updates the condition flags, so an enclosing asm
 * statement should not rely on flags surviving this macro -- confirm.
 */
#define SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(XTMP,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
|
||||
"ptrue " #PT".d \n\t" \
|
||||
"mov " #XTMP", #2 \n\t" \
|
||||
"whilelo " #P2C".d, xzr, " #XTMP" \n\t" \
|
||||
"mov " #XTMP", #4 \n\t" \
|
||||
"whilelo " #P4".d, xzr, " #XTMP" \n\t" \
|
||||
"mov " #XTMP", #6 \n\t" \
|
||||
"whilelo " #P6".d, xzr, " #XTMP" \n\t" \
|
||||
\
|
||||
"eor " #PTFTF".b, " #PT"/z, " #P6".b, " #P4".b \n\t" /***** o o | o */ \
|
||||
"orr " #PTFTF".b, " #PT"/z, " #PTFTF".b, " #P2C".b \n\t" /* | o | o */ \
|
||||
\
|
||||
"not " #P2C".b, " #PT"/z, " #P2C".b \n\t" \
|
||||
"not " #P4C".b, " #PT"/z, " #P4".b \n\t" \
|
||||
"not " #P6C".b, " #PT"/z, " #P6".b \n\t" \
|
||||
|
||||
/*
 * SVE512_IN_REG_TRANSPOSE_d8x8 -- emit assembly text that transposes an 8x8
 * block of doubles held in eight vector registers.
 *
 *   SRC0 .. SRC7  in : the eight source rows. They are reused as temporaries
 *                      and do NOT hold their original contents afterwards.
 *   DST0 .. DST7  out: the eight result vectors.
 *   P2C, PTFTF, P4C, P4  predicates used by compact / sel.
 *   PT, P6C, P6   accepted for signature symmetry; not referenced in the
 *                 expansion.
 *
 * The expansion works in three stages:
 *   1. trn1/trn2 on each row pair interleave even/odd elements (2x2 step).
 *   2. "compact" with P2C moves elements down by two lanes and "ext ... #48"
 *      moves elements up by two lanes; "sel" with PTFTF merges the pieces
 *      (4x4 step).
 *   3. "compact" with P4C moves elements down by four lanes and "ext ... #32"
 *      moves elements up by four lanes; "sel" with P4 merges the halves
 *      (8x8 step).
 * The lanes that "ext" shifts in from the old SRC contents are never selected
 * by the following "sel", so their values do not matter.
 * NOTE(review): assumes a 512-bit vector length and predicates as produced by
 * SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE; under that assumption DSTk ends up
 * holding column k of the block whose rows were SRC0..SRC7 -- confirm.
 */
#define SVE512_IN_REG_TRANSPOSE_d8x8(DST0,DST1,DST2,DST3,DST4,DST5,DST6,DST7,SRC0,SRC1,SRC2,SRC3,SRC4,SRC5,SRC6,SRC7,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
|
||||
"trn1 " #DST0".d, " #SRC0".d, " #SRC1".d \n\t" \
|
||||
"trn2 " #DST1".d, " #SRC0".d, " #SRC1".d \n\t" \
|
||||
"trn1 " #DST2".d, " #SRC2".d, " #SRC3".d \n\t" \
|
||||
"trn2 " #DST3".d, " #SRC2".d, " #SRC3".d \n\t" \
|
||||
"trn1 " #DST4".d, " #SRC4".d, " #SRC5".d \n\t" \
|
||||
"trn2 " #DST5".d, " #SRC4".d, " #SRC5".d \n\t" \
|
||||
"trn1 " #DST6".d, " #SRC6".d, " #SRC7".d \n\t" \
|
||||
"trn2 " #DST7".d, " #SRC6".d, " #SRC7".d \n\t" \
|
||||
\
|
||||
"compact " #SRC0".d, " #P2C", " #DST0".d \n\t" \
|
||||
"compact " #SRC2".d, " #P2C", " #DST1".d \n\t" \
|
||||
"ext " #SRC1".b, " #SRC1".b, " #DST2".b, #48 \n\t" \
|
||||
"ext " #SRC3".b, " #SRC3".b, " #DST3".b, #48 \n\t" \
|
||||
"compact " #SRC4".d, " #P2C", " #DST4".d \n\t" \
|
||||
"compact " #SRC6".d, " #P2C", " #DST5".d \n\t" \
|
||||
"ext " #SRC5".b, " #SRC5".b, " #DST6".b, #48 \n\t" \
|
||||
"ext " #SRC7".b, " #SRC7".b, " #DST7".b, #48 \n\t" \
|
||||
\
|
||||
"sel " #DST0".d, " #PTFTF", " #DST0".d, " #SRC1".d \n\t" \
|
||||
"sel " #DST2".d, " #PTFTF", " #SRC0".d, " #DST2".d \n\t" \
|
||||
"sel " #DST1".d, " #PTFTF", " #DST1".d, " #SRC3".d \n\t" \
|
||||
"sel " #DST3".d, " #PTFTF", " #SRC2".d, " #DST3".d \n\t" \
|
||||
"sel " #DST4".d, " #PTFTF", " #DST4".d, " #SRC5".d \n\t" \
|
||||
"sel " #DST6".d, " #PTFTF", " #SRC4".d, " #DST6".d \n\t" \
|
||||
"sel " #DST5".d, " #PTFTF", " #DST5".d, " #SRC7".d \n\t" \
|
||||
"sel " #DST7".d, " #PTFTF", " #SRC6".d, " #DST7".d \n\t" \
|
||||
\
|
||||
"compact " #SRC0".d, " #P4C", " #DST0".d \n\t" \
|
||||
"compact " #SRC1".d, " #P4C", " #DST1".d \n\t" \
|
||||
"compact " #SRC2".d, " #P4C", " #DST2".d \n\t" \
|
||||
"compact " #SRC3".d, " #P4C", " #DST3".d \n\t" \
|
||||
"ext " #SRC4".b, " #SRC4".b, " #DST4".b, #32 \n\t" \
|
||||
"ext " #SRC5".b, " #SRC5".b, " #DST5".b, #32 \n\t" \
|
||||
"ext " #SRC6".b, " #SRC6".b, " #DST6".b, #32 \n\t" \
|
||||
"ext " #SRC7".b, " #SRC7".b, " #DST7".b, #32 \n\t" \
|
||||
\
|
||||
"sel " #DST0".d, " #P4", " #DST0".d, " #SRC4".d \n\t" \
|
||||
"sel " #DST1".d, " #P4", " #DST1".d, " #SRC5".d \n\t" \
|
||||
"sel " #DST2".d, " #P4", " #DST2".d, " #SRC6".d \n\t" \
|
||||
"sel " #DST3".d, " #P4", " #DST3".d, " #SRC7".d \n\t" \
|
||||
"sel " #DST4".d, " #P4", " #SRC0".d, " #DST4".d \n\t" \
|
||||
"sel " #DST5".d, " #P4", " #SRC1".d, " #DST5".d \n\t" \
|
||||
"sel " #DST6".d, " #P4", " #SRC2".d, " #DST6".d \n\t" \
|
||||
"sel " #DST7".d, " #P4", " #SRC3".d, " #DST7".d \n\t"
|
||||
|
||||
@@ -52,15 +52,12 @@ void bli_dpackm_armsve256_asm_8xk
|
||||
dim_t cdim_,
|
||||
dim_t n_,
|
||||
dim_t n_max_,
|
||||
void* restrict kappa_,
|
||||
void* restrict a_, inc_t inca_, inc_t lda_,
|
||||
void* restrict p_, inc_t ldp_,
|
||||
double* restrict kappa,
|
||||
double* restrict a, inc_t inca_, inc_t lda_,
|
||||
double* restrict p, inc_t ldp_,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
double* a = ( double* )a_;
|
||||
double* p = ( double* )p_;
|
||||
double* kappa = ( double* )kappa_;
|
||||
const int64_t cdim = cdim_;
|
||||
const int64_t mnr = 8;
|
||||
const int64_t n = n_;
|
||||
|
||||
365
kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
Normal file
365
kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
Normal file
@@ -0,0 +1,365 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "armsve512_asm_transpose_d8x8.h"
|
||||
#include "armsve512_asm_transpose_d8x2.h"
|
||||
|
||||
// assumption:
|
||||
// SVE vector length = 512 bits.
|
||||
|
||||
void bli_dpackm_armsve512_asm_10xk
|
||||
(
|
||||
conj_t conja,
|
||||
pack_t schema,
|
||||
dim_t cdim_,
|
||||
dim_t n_,
|
||||
dim_t n_max_,
|
||||
double* restrict kappa,
|
||||
double* restrict a, inc_t inca_, inc_t lda_,
|
||||
double* restrict p, inc_t ldp_,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const int64_t cdim = cdim_;
|
||||
const int64_t mnr = 10;
|
||||
const int64_t n = n_;
|
||||
const int64_t n_max = n_max_;
|
||||
const int64_t inca = inca_;
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
const bool gs = inca != 1 && lda != 1;
|
||||
const bool unitk = bli_deq1( *kappa );
|
||||
|
||||
#ifdef _A64FX
|
||||
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
|
||||
{
|
||||
// A twisted way to infer whether A or B is being packed.
|
||||
if ( schema == bli_cntx_schema_a_block(cntx) )
|
||||
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
|
||||
if ( schema == bli_cntx_schema_b_panel(cntx) )
|
||||
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
|
||||
}
|
||||
#endif
|
||||
|
||||
if ( cdim == mnr && !gs && unitk )
|
||||
{
|
||||
uint64_t n_mker = n / 8;
|
||||
uint64_t n_left = n % 8;
|
||||
__asm__ volatile (
|
||||
"mov x0, %[a] \n\t"
|
||||
"mov x1, %[p] \n\t"
|
||||
"mov x2, %[ldp] \n\t"
|
||||
"mov x3, %[lda] \n\t"
|
||||
"mov x4, %[inca] \n\t"
|
||||
"cmp x4, #1 \n\t"
|
||||
// Skips by sizeof(double).
|
||||
"mov x8, #8 \n\t"
|
||||
"madd x2, x2, x8, xzr \n\t"
|
||||
"madd x3, x3, x8, xzr \n\t"
|
||||
"madd x4, x4, x8, xzr \n\t"
|
||||
// Loop constants.
|
||||
"mov x8, %[n_mker] \n\t"
|
||||
"mov x9, %[n_left] \n\t"
|
||||
"ptrue p0.d \n\t"
|
||||
"b.ne .AROWSTOR \n\t"
|
||||
// A stored in columns.
|
||||
" .ACOLSTOR: \n\t"
|
||||
// Prefetch distance.
|
||||
"mov x17, #8 \n\t"
|
||||
"madd x17, x17, x3, xzr \n\t"
|
||||
#ifdef _A64FX
|
||||
// Disable hardware prefetch for A.
|
||||
"mov x16, 0x6 \n\t"
|
||||
"lsl x16, x16, #60 \n\t"
|
||||
"orr x0, x0, x16 \n\t"
|
||||
#endif
|
||||
" .ACOLSTORMKER: \n\t"
|
||||
"cmp x8, xzr \n\t"
|
||||
"b.eq .ACOLSTORMKEREND \n\t"
|
||||
"add x5, x0, x3 \n\t"
|
||||
"add x6, x5, x3 \n\t"
|
||||
"add x7, x6, x3 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ldr q1, [x0, #64] \n\t"
|
||||
"ld1d z2.d, p0/z, [x5] \n\t"
|
||||
"ldr q3, [x5, #64] \n\t"
|
||||
"ld1d z4.d, p0/z, [x6] \n\t"
|
||||
"ldr q5, [x6, #64] \n\t"
|
||||
"ld1d z6.d, p0/z, [x7] \n\t"
|
||||
"ldr q7, [x7, #64] \n\t"
|
||||
"add x18, x17, x0 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x5 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x6 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x7 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x0, x7, x3 \n\t"
|
||||
"add x5, x0, x3 \n\t"
|
||||
"add x6, x5, x3 \n\t"
|
||||
"add x7, x6, x3 \n\t"
|
||||
"ld1d z8.d, p0/z, [x0] \n\t"
|
||||
"ldr q9, [x0, #64] \n\t"
|
||||
"ld1d z10.d, p0/z, [x5] \n\t"
|
||||
"ldr q11, [x5, #64] \n\t"
|
||||
"ld1d z12.d, p0/z, [x6] \n\t"
|
||||
"ldr q13, [x6, #64] \n\t"
|
||||
"ld1d z14.d, p0/z, [x7] \n\t"
|
||||
"ldr q15, [x7, #64] \n\t"
|
||||
"add x18, x17, x0 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x5 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x6 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x7 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
// Plain storage
|
||||
"add x10, x1, x2 \n\t"
|
||||
"add x11, x10, x2 \n\t"
|
||||
"add x12, x11, x2 \n\t"
|
||||
"add x13, x12, x2 \n\t"
|
||||
"add x14, x13, x2 \n\t"
|
||||
"add x15, x14, x2 \n\t"
|
||||
"add x16, x15, x2 \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"str q1, [x1, #64] \n\t"
|
||||
"st1d z2.d, p0, [x10] \n\t"
|
||||
"str q3, [x10, #64] \n\t"
|
||||
"st1d z4.d, p0, [x11] \n\t"
|
||||
"str q5, [x11, #64] \n\t"
|
||||
"st1d z6.d, p0, [x12] \n\t"
|
||||
"str q7, [x12, #64] \n\t"
|
||||
"st1d z8.d, p0, [x13] \n\t"
|
||||
"str q9, [x13, #64] \n\t"
|
||||
"st1d z10.d, p0, [x14] \n\t"
|
||||
"str q11, [x14, #64] \n\t"
|
||||
"st1d z12.d, p0, [x15] \n\t"
|
||||
"str q13, [x15, #64] \n\t"
|
||||
"st1d z14.d, p0, [x16] \n\t"
|
||||
"str q15, [x16, #64] \n\t"
|
||||
"add x1, x16, x2 \n\t"
|
||||
// Realign and store.
|
||||
// "ext z1.b, z1.b, z1.b, #16 \n\t"
|
||||
// "ext z1.b, z1.b, z2.b, #48 \n\t"
|
||||
// "ext z2.b, z2.b, z3.b, #16 \n\t"
|
||||
// "ext z2.b, z2.b, z4.b, #32 \n\t"
|
||||
// "ext z4.b, z4.b, z5.b, #16 \n\t"
|
||||
// "ext z4.b, z4.b, z6.b, #16 \n\t"
|
||||
// "ext z6.b, z6.b, z7.b, #16 \n\t"
|
||||
// "ext z9.b, z9.b, z9.b, #16 \n\t"
|
||||
// "ext z9.b, z9.b, z10.b, #48 \n\t"
|
||||
// "ext z10.b, z10.b, z11.b, #16 \n\t"
|
||||
// "ext z10.b, z10.b, z12.b, #32 \n\t"
|
||||
// "ext z12.b, z12.b, z13.b, #16 \n\t"
|
||||
// "ext z12.b, z12.b, z14.b, #16 \n\t"
|
||||
// "ext z14.b, z14.b, z15.b, #16 \n\t"
|
||||
// "st1d z0.d, p0, [x1] \n\t"
|
||||
// "st1d z1.d, p0, [x1, #1, mul vl] \n\t"
|
||||
// "st1d z2.d, p0, [x1, #2, mul vl] \n\t"
|
||||
// "st1d z4.d, p0, [x1, #3, mul vl] \n\t"
|
||||
// "st1d z6.d, p0, [x1, #4, mul vl] \n\t"
|
||||
// "add x1, x1, #320 \n\t"
|
||||
// "st1d z8.d, p0, [x1] \n\t"
|
||||
// "st1d z9.d, p0, [x1, #1, mul vl] \n\t"
|
||||
// "st1d z10.d, p0, [x1, #2, mul vl] \n\t"
|
||||
// "st1d z12.d, p0, [x1, #3, mul vl] \n\t"
|
||||
// "st1d z14.d, p0, [x1, #4, mul vl] \n\t"
|
||||
// "add x1, x1, #320 \n\t"
|
||||
"add x0, x7, x3 \n\t"
|
||||
"sub x8, x8, #1 \n\t"
|
||||
"b .ACOLSTORMKER \n\t"
|
||||
" .ACOLSTORMKEREND: \n\t"
|
||||
" .ACOLSTORLEFT: \n\t"
|
||||
"cmp x9, xzr \n\t"
|
||||
"b.eq .UNITKDONE \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ldr q1, [x0, #64] \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"str q1, [x1, #64] \n\t"
|
||||
"add x0, x0, x3 \n\t"
|
||||
"add x1, x1, x2 \n\t"
|
||||
"sub x9, x9, #1 \n\t"
|
||||
"b .ACOLSTORLEFT \n\t"
|
||||
// A stored in rows.
|
||||
" .AROWSTOR: \n\t"
|
||||
// Prepare predicates for in-reg transpose.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
|
||||
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
|
||||
"cmp x8, xzr \n\t"
|
||||
"b.eq .AROWSTORMKEREND \n\t"
|
||||
"add x10, x0, x4 \n\t"
|
||||
"add x11, x10, x4 \n\t"
|
||||
"add x12, x11, x4 \n\t"
|
||||
"add x13, x12, x4 \n\t"
|
||||
"add x14, x13, x4 \n\t"
|
||||
"add x15, x14, x4 \n\t"
|
||||
"add x16, x15, x4 \n\t"
|
||||
"add x17, x16, x4 \n\t"
|
||||
"add x18, x17, x4 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ld1d z1.d, p0/z, [x10] \n\t"
|
||||
"ld1d z2.d, p0/z, [x11] \n\t"
|
||||
"ld1d z3.d, p0/z, [x12] \n\t"
|
||||
"ld1d z4.d, p0/z, [x13] \n\t"
|
||||
"ld1d z5.d, p0/z, [x14] \n\t"
|
||||
"ld1d z6.d, p0/z, [x15] \n\t"
|
||||
"ld1d z7.d, p0/z, [x16] \n\t"
|
||||
"ld1d z22.d, p0/z, [x17] \n\t"
|
||||
"ld1d z23.d, p0/z, [x18] \n\t"
|
||||
// Transpose first 8 rows.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
|
||||
// Transpose last 2 rows.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x2(z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3)
|
||||
// Plain storage.
|
||||
"add x10, x1, x2 \n\t"
|
||||
"add x11, x10, x2 \n\t"
|
||||
"add x12, x11, x2 \n\t"
|
||||
"add x13, x12, x2 \n\t"
|
||||
"add x14, x13, x2 \n\t"
|
||||
"add x15, x14, x2 \n\t"
|
||||
"add x16, x15, x2 \n\t"
|
||||
"st1d z8.d, p0, [x1] \n\t"
|
||||
"str q16, [x1, #64] \n\t"
|
||||
"st1d z9.d, p0, [x10] \n\t"
|
||||
"str q17, [x10, #64] \n\t"
|
||||
"st1d z10.d, p0, [x11] \n\t"
|
||||
"str q18, [x11, #64] \n\t"
|
||||
"st1d z11.d, p0, [x12] \n\t"
|
||||
"str q19, [x12, #64] \n\t"
|
||||
"st1d z12.d, p0, [x13] \n\t"
|
||||
"str q20, [x13, #64] \n\t"
|
||||
"st1d z13.d, p0, [x14] \n\t"
|
||||
"str q21, [x14, #64] \n\t"
|
||||
"st1d z14.d, p0, [x15] \n\t"
|
||||
"str q22, [x15, #64] \n\t"
|
||||
"st1d z15.d, p0, [x16] \n\t"
|
||||
"str q23, [x16, #64] \n\t"
|
||||
"add x1, x16, x2 \n\t"
|
||||
"add x0, x0, #64 \n\t"
|
||||
"sub x8, x8, #1 \n\t"
|
||||
"b .AROWSTORMKER \n\t"
|
||||
" .AROWSTORMKEREND: \n\t"
|
||||
"mov x4, %[inca] \n\t" // Restore unshifted inca.
|
||||
"index z30.d, xzr, x4 \n\t" // Generate index.
|
||||
"lsl x4, x4, #3 \n\t" // Shift again.
|
||||
"lsl x5, x4, #3 \n\t" // Virtual column vl.
|
||||
" .AROWSTORLEFT: \n\t"
|
||||
"cmp x9, xzr \n\t"
|
||||
"b.eq .UNITKDONE \n\t"
|
||||
"add x6, x0, x5 \n\t"
|
||||
"add x7, x6, x4 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
|
||||
"ldr d1, [x6] \n\t"
|
||||
"ldr d2, [x7] \n\t"
|
||||
"trn1 v1.2d, v1.2d, v2.2d \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"str q1, [x1, #64] \n\t"
|
||||
"add x1, x1, x2 \n\t"
|
||||
"add x0, x0, #8 \n\t"
|
||||
"sub x9, x9, #1 \n\t"
|
||||
"b .AROWSTORLEFT \n\t"
|
||||
" .UNITKDONE: \n\t"
|
||||
"mov x0, #0 \n\t"
|
||||
:
|
||||
: [a] "r" (a),
|
||||
[p] "r" (p),
|
||||
[lda] "r" (lda),
|
||||
[ldp] "r" (ldp),
|
||||
[inca] "r" (inca),
|
||||
[n_mker] "r" (n_mker),
|
||||
[n_left] "r" (n_left)
|
||||
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
|
||||
"x8", "x9", "x10","x11","x12","x13","x14","x15",
|
||||
"x16","x17","x18",
|
||||
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
|
||||
"z8", "z9", "z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19","z20","z21","z22","z23",
|
||||
// "z24","z25","z26","z27","z28","z29",
|
||||
"z30","z31",
|
||||
"p0", "p1", "p2", "p3", "p4", // "p5",
|
||||
"p6", "p7", "p8"
|
||||
);
|
||||
}
|
||||
else // if ( cdim < mnr )
|
||||
{
|
||||
bli_dscal2m_ex
|
||||
(
|
||||
0,
|
||||
BLIS_NONUNIT_DIAG,
|
||||
BLIS_DENSE,
|
||||
( trans_t )conja,
|
||||
cdim,
|
||||
n,
|
||||
kappa,
|
||||
a, inca, lda,
|
||||
p, 1, ldp,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
// if ( cdim < mnr )
|
||||
{
|
||||
const dim_t i = cdim;
|
||||
const dim_t m_edge = mnr - i;
|
||||
const dim_t n_edge = n_max;
|
||||
double* restrict p_edge = p + (i )*1;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if ( n < n_max )
|
||||
{
|
||||
const dim_t j = n;
|
||||
const dim_t m_edge = mnr;
|
||||
const dim_t n_edge = n_max - j;
|
||||
double* restrict p_edge = p + (j )*ldp;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
359
kernels/armsve/1m/bli_dpackm_armsve512_asm_12xk.c
Normal file
359
kernels/armsve/1m/bli_dpackm_armsve512_asm_12xk.c
Normal file
@@ -0,0 +1,359 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Linaro Limited
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#else
|
||||
#error "No Arm SVE intrinsics support in compiler"
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
// assumption:
|
||||
// SVE vector length = 512 bits.
|
||||
// TODO:
|
||||
// 2-rows -> 3 vectors packing, using a predicate only when an odd number of rows is to be packed.
|
||||
// prefetching is needed.
|
||||
|
||||
void bli_dpackm_armsve512_asm_12xk
|
||||
(
|
||||
conj_t conja,
|
||||
pack_t schema,
|
||||
dim_t cdim_,
|
||||
dim_t n_,
|
||||
dim_t n_max_,
|
||||
double* restrict kappa,
|
||||
double* restrict a, inc_t inca_, inc_t lda_,
|
||||
double* restrict p, inc_t ldp_,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const int64_t cdim = cdim_;
|
||||
const int64_t mnr = 12;
|
||||
const int64_t n = n_;
|
||||
const int64_t n_max = n_max_;
|
||||
const int64_t inca = inca_;
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
|
||||
double* restrict alpha1 = a;
|
||||
double* restrict alpha1_8 = alpha1 + 8 * inca;
|
||||
double* restrict alpha1_p4 = alpha1 + 4 * inca;
|
||||
double* restrict alpha1_m4 = alpha1 - 4 * inca;
|
||||
double* restrict pi1 = p;
|
||||
const svbool_t all_active = svptrue_b64();
|
||||
const svbool_t first_half_active = svwhilelt_b64(0, 4);
|
||||
const svbool_t last_half_active = svnot_z(all_active, first_half_active);
|
||||
svfloat64_t z_a0;
|
||||
svfloat64_t z_a8;
|
||||
svfloat64_t z_a8_lh;
|
||||
svfloat64_t z_a16;
|
||||
svuint64_t z_index;
|
||||
|
||||
// creating index for gather/scatter
|
||||
// with each element as: 0, 1*inca, 2*inca, 3*inca
|
||||
z_index = svindex_u64( 0, inca * sizeof( double ) );
|
||||
|
||||
if ( cdim == mnr )
|
||||
{
|
||||
if ( bli_deq1( *kappa ) )
|
||||
{
|
||||
if ( inca == 1 ) // continous memory. packA style
|
||||
{
|
||||
dim_t k = n;
|
||||
// 2 pack into 3 case.
|
||||
if ( ldp == mnr )
|
||||
{
|
||||
for ( ; k > 1; k -= 2 )
|
||||
{
|
||||
// load 12 continuous elements from *a
|
||||
z_a0 = svld1_f64( all_active, alpha1 );
|
||||
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
|
||||
|
||||
// forward address - 0 to 1
|
||||
alpha1 += lda;
|
||||
alpha1_p4 = alpha1 + 4 * inca;
|
||||
alpha1_m4 = alpha1 - 4 * inca;
|
||||
|
||||
// load 12 continuous elements from *a, filling last half of z8.
|
||||
z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
|
||||
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
|
||||
z_a16 = svld1_f64( all_active, alpha1_p4 );
|
||||
|
||||
// stored packed data into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
|
||||
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
|
||||
|
||||
// forward address - 1 to 0
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += 2 * ldp;
|
||||
}
|
||||
}
|
||||
// line-by-line packing case.
|
||||
for ( ; k != 0; --k )
|
||||
{
|
||||
// load 12 continuous elements from *a
|
||||
z_a0 = svld1_f64( all_active, alpha1 );
|
||||
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
|
||||
|
||||
// store them into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
|
||||
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += ldp;
|
||||
}
|
||||
}
|
||||
else // gather/scatter load/store. packB style
|
||||
{
|
||||
dim_t k = n;
|
||||
if ( ldp == mnr )
|
||||
{
|
||||
for ( ; k > 1; k -= 2 )
|
||||
{
|
||||
// gather load from *a
|
||||
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
|
||||
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
|
||||
|
||||
// forward address - 0 to 1
|
||||
alpha1 += lda;
|
||||
alpha1_p4 = alpha1 + 4 * inca;
|
||||
alpha1_m4 = alpha1 - 4 * inca;
|
||||
|
||||
// gather load from *a, filling last half of z8.
|
||||
z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
|
||||
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
|
||||
z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
|
||||
|
||||
// stored packed data into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
|
||||
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
|
||||
|
||||
// forward address - 1 to 0
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += 2 * ldp;
|
||||
}
|
||||
}
|
||||
for ( ; k != 0; --k )
|
||||
{
|
||||
// gather load from *a
|
||||
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
|
||||
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
|
||||
|
||||
// contiguous store into *p (the gather loads above already linearized the elements)
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
|
||||
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += ldp;
|
||||
}
|
||||
}
|
||||
}
|
||||
else // *kappa != 1.0
|
||||
{
|
||||
// load kappa into vector
|
||||
svfloat64_t z_kappa;
|
||||
|
||||
z_kappa = svdup_f64( *kappa );
|
||||
|
||||
if ( inca == 1 ) // continous memory. packA style
|
||||
{
|
||||
dim_t k = n;
|
||||
if ( ldp == mnr )
|
||||
{
|
||||
for ( ; k > 1; k -= 2 )
|
||||
{
|
||||
// load 12 continuous elements from *a
|
||||
z_a0 = svld1_f64( all_active, alpha1 );
|
||||
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
|
||||
|
||||
// forward address - 0 to 1
|
||||
alpha1 += lda;
|
||||
alpha1_p4 = alpha1 + 4 * inca;
|
||||
alpha1_m4 = alpha1 - 4 * inca;
|
||||
|
||||
// load 12 continuous elements from *a, filling last half of z8.
|
||||
z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
|
||||
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
|
||||
z_a16 = svld1_f64( all_active, alpha1_p4 );
|
||||
|
||||
// multiply by *kappa
|
||||
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
|
||||
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
|
||||
z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
|
||||
|
||||
// stored packed data into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
|
||||
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
|
||||
|
||||
// forward address - 1 to 0
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += 2 * ldp;
|
||||
}
|
||||
}
|
||||
for ( ; k != 0; --k )
|
||||
{
|
||||
// load 12 continuous elments from *a
|
||||
z_a0 = svld1_f64( all_active, alpha1 );
|
||||
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
|
||||
|
||||
// multiply by *kappa
|
||||
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
|
||||
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
|
||||
|
||||
// store them into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
|
||||
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += ldp;
|
||||
}
|
||||
}
|
||||
else // gather/scatter load/store. packB style
|
||||
{
|
||||
dim_t k = n;
|
||||
if ( ldp == mnr )
|
||||
{
|
||||
for ( ; k > 1; k -= 2 )
|
||||
{
|
||||
// gather load from *a
|
||||
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
|
||||
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
|
||||
|
||||
// forward address - 0 to 1
|
||||
alpha1 += lda;
|
||||
alpha1_p4 = alpha1 + 4 * inca;
|
||||
alpha1_m4 = alpha1 - 4 * inca;
|
||||
|
||||
// gather load from *a, filling last half of z8.
|
||||
z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
|
||||
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
|
||||
z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
|
||||
|
||||
// multiply by *kappa
|
||||
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
|
||||
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
|
||||
z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
|
||||
|
||||
// stored packed data into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
|
||||
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
|
||||
|
||||
// forward address - 1 to 0
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += 2 * ldp;
|
||||
}
|
||||
}
|
||||
for ( ; k != 0; --k )
|
||||
{
|
||||
// gather load from *a
|
||||
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
|
||||
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
|
||||
|
||||
// multiply by *kappa
|
||||
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
|
||||
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
|
||||
|
||||
// scatter store into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
|
||||
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += ldp;
|
||||
}
|
||||
}
|
||||
} // end of if ( *kappa == 1.0 )
|
||||
}
|
||||
else // if ( cdim < mnr )
|
||||
{
|
||||
bli_dscal2m_ex
|
||||
(
|
||||
0,
|
||||
BLIS_NONUNIT_DIAG,
|
||||
BLIS_DENSE,
|
||||
( trans_t )conja,
|
||||
cdim,
|
||||
n,
|
||||
kappa,
|
||||
a, inca, lda,
|
||||
p, 1, ldp,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
// if ( cdim < mnr )
|
||||
{
|
||||
const dim_t i = cdim;
|
||||
const dim_t m_edge = mnr - i;
|
||||
const dim_t n_edge = n_max;
|
||||
double* restrict p_edge = p + (i )*1;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if ( n < n_max )
|
||||
{
|
||||
const dim_t j = n;
|
||||
const dim_t m_edge = mnr;
|
||||
const dim_t n_edge = n_max - j;
|
||||
double* restrict p_edge = p + (j )*ldp;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
363
kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
Normal file
363
kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
Normal file
@@ -0,0 +1,363 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "armsve512_asm_transpose_d8x8.h"
|
||||
|
||||
// assumption:
|
||||
// SVE vector length = 512 bits.
|
||||
|
||||
void bli_dpackm_armsve512_asm_16xk
|
||||
(
|
||||
conj_t conja,
|
||||
pack_t schema,
|
||||
dim_t cdim_,
|
||||
dim_t n_,
|
||||
dim_t n_max_,
|
||||
double* restrict kappa,
|
||||
double* restrict a, inc_t inca_, inc_t lda_,
|
||||
double* restrict p, inc_t ldp_,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const int64_t cdim = cdim_;
|
||||
const int64_t mnr = 16;
|
||||
const int64_t n = n_;
|
||||
const int64_t n_max = n_max_;
|
||||
const int64_t inca = inca_;
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
const bool gs = inca != 1 && lda != 1;
|
||||
const bool unitk = bli_deq1( *kappa );
|
||||
|
||||
#ifdef _A64FX
|
||||
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
|
||||
{
|
||||
// A twisted way to infer whether A or B is being packed.
|
||||
if ( schema == bli_cntx_schema_a_block(cntx) )
|
||||
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
|
||||
if ( schema == bli_cntx_schema_b_panel(cntx) )
|
||||
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
|
||||
}
|
||||
#endif
|
||||
|
||||
if ( cdim == mnr && !gs && unitk )
|
||||
{
|
||||
uint64_t n_mker = n / 8;
|
||||
uint64_t n_left = n % 8;
|
||||
__asm__ volatile (
|
||||
"mov x0, %[a] \n\t"
|
||||
"mov x1, %[p] \n\t"
|
||||
"mov x2, %[ldp] \n\t"
|
||||
"mov x3, %[lda] \n\t"
|
||||
"mov x4, %[inca] \n\t"
|
||||
"cmp x4, #1 \n\t"
|
||||
// Skips by sizeof(double).
|
||||
"mov x8, #8 \n\t"
|
||||
"madd x2, x2, x8, xzr \n\t"
|
||||
"madd x3, x3, x8, xzr \n\t"
|
||||
"madd x4, x4, x8, xzr \n\t"
|
||||
|
||||
// "mov x8, 0x8 \n\t" // Control#0 for A address.
|
||||
// "mov x8, 0x24 \n\t" // Higher 6bit for Control#0:
|
||||
// "lsl x8, x8, #58 \n\t" // Valid|Strong|Strong|Alloc|Load|Strong
|
||||
// "orr x8, x8, x3 \n\t" // Stride.
|
||||
// "msr S3_3_C11_C6_0, x8 \n\t" // Write system register.
|
||||
|
||||
// Loop constants.
|
||||
"mov x8, %[n_mker] \n\t"
|
||||
"mov x9, %[n_left] \n\t"
|
||||
"ptrue p0.d \n\t"
|
||||
"b.ne .AROWSTOR \n\t"
|
||||
// A stored in columns.
|
||||
" .ACOLSTOR: \n\t"
|
||||
// Prefetch distance.
|
||||
"mov x17, #8 \n\t"
|
||||
"madd x17, x17, x3, xzr \n\t"
|
||||
#ifdef _A64FX
|
||||
"mov x16, 0x6 \n\t" // Disable hardware prefetch for A.
|
||||
"lsl x16, x16, #60 \n\t"
|
||||
"orr x0, x0, x16 \n\t"
|
||||
#endif
|
||||
// "add x5, x0, x3 \n\t"
|
||||
// "add x6, x5, x3 \n\t"
|
||||
// "add x7, x6, x3 \n\t"
|
||||
// "prfm PLDL1STRM, [x0] \n\t"
|
||||
// "prfm PLDL1STRM, [x5] \n\t"
|
||||
// "prfm PLDL1STRM, [x6] \n\t"
|
||||
// "prfm PLDL1STRM, [x7] \n\t"
|
||||
// "add x18, x7, x3 \n\t"
|
||||
// "add x5, x18, x3 \n\t"
|
||||
// "add x6, x5, x3 \n\t"
|
||||
// "add x7, x6, x3 \n\t"
|
||||
// "prfm PLDL1STRM, [x18] \n\t"
|
||||
// "prfm PLDL1STRM, [x5] \n\t"
|
||||
// "prfm PLDL1STRM, [x6] \n\t"
|
||||
// "prfm PLDL1STRM, [x7] \n\t"
|
||||
" .ACOLSTORMKER: \n\t"
|
||||
"cmp x8, xzr \n\t"
|
||||
"b.eq .ACOLSTORMKEREND \n\t"
|
||||
"add x5, x0, x3 \n\t"
|
||||
"add x6, x5, x3 \n\t"
|
||||
"add x7, x6, x3 \n\t"
|
||||
"add x10, x1, x2 \n\t"
|
||||
"add x11, x10, x2 \n\t"
|
||||
"add x12, x11, x2 \n\t"
|
||||
"add x13, x12, x2 \n\t"
|
||||
"add x14, x13, x2 \n\t"
|
||||
"add x15, x14, x2 \n\t"
|
||||
"add x16, x15, x2 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
|
||||
"ld1d z2.d, p0/z, [x5] \n\t"
|
||||
"ld1d z3.d, p0/z, [x5, #1, mul vl] \n\t"
|
||||
"ld1d z4.d, p0/z, [x6] \n\t"
|
||||
"ld1d z5.d, p0/z, [x6, #1, mul vl] \n\t"
|
||||
"ld1d z6.d, p0/z, [x7] \n\t"
|
||||
"ld1d z7.d, p0/z, [x7, #1, mul vl] \n\t"
|
||||
"add x18, x17, x0 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x5 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x6 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x7 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x0, x7, x3 \n\t"
|
||||
"add x5, x0, x3 \n\t"
|
||||
"add x6, x5, x3 \n\t"
|
||||
"add x7, x6, x3 \n\t"
|
||||
"ld1d z8.d, p0/z, [x0] \n\t"
|
||||
"ld1d z9.d, p0/z, [x0, #1, mul vl] \n\t"
|
||||
"ld1d z10.d, p0/z, [x5] \n\t"
|
||||
"ld1d z11.d, p0/z, [x5, #1, mul vl] \n\t"
|
||||
"ld1d z12.d, p0/z, [x6] \n\t"
|
||||
"ld1d z13.d, p0/z, [x6, #1, mul vl] \n\t"
|
||||
"ld1d z14.d, p0/z, [x7] \n\t"
|
||||
"ld1d z15.d, p0/z, [x7, #1, mul vl] \n\t"
|
||||
"add x18, x17, x0 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x5 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x6 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x7 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
|
||||
"st1d z2.d, p0, [x10] \n\t"
|
||||
"st1d z3.d, p0, [x10, #1, mul vl] \n\t"
|
||||
"st1d z4.d, p0, [x11] \n\t"
|
||||
"st1d z5.d, p0, [x11, #1, mul vl] \n\t"
|
||||
"st1d z6.d, p0, [x12] \n\t"
|
||||
"st1d z7.d, p0, [x12, #1, mul vl] \n\t"
|
||||
"st1d z8.d, p0, [x13] \n\t"
|
||||
"st1d z9.d, p0, [x13, #1, mul vl] \n\t"
|
||||
"st1d z10.d, p0, [x14] \n\t"
|
||||
"st1d z11.d, p0, [x14, #1, mul vl] \n\t"
|
||||
"st1d z12.d, p0, [x15] \n\t"
|
||||
"st1d z13.d, p0, [x15, #1, mul vl] \n\t"
|
||||
"st1d z14.d, p0, [x16] \n\t"
|
||||
"st1d z15.d, p0, [x16, #1, mul vl] \n\t"
|
||||
"add x0, x7, x3 \n\t"
|
||||
"add x1, x16, x2 \n\t"
|
||||
"sub x8, x8, #1 \n\t"
|
||||
"b .ACOLSTORMKER \n\t"
|
||||
" .ACOLSTORMKEREND: \n\t"
|
||||
" .ACOLSTORLEFT: \n\t"
|
||||
"cmp x9, xzr \n\t"
|
||||
"b.eq .UNITKDONE \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
|
||||
"add x0, x0, x3 \n\t"
|
||||
"add x1, x1, x2 \n\t"
|
||||
"sub x9, x9, #1 \n\t"
|
||||
"b .ACOLSTORLEFT \n\t"
|
||||
// A stored in rows.
|
||||
" .AROWSTOR: \n\t"
|
||||
// Prepare predicates for in-reg transpose.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
|
||||
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
|
||||
"cmp x8, xzr \n\t"
|
||||
"b.eq .AROWSTORMKEREND \n\t"
|
||||
"add x10, x0, x4 \n\t"
|
||||
"add x11, x10, x4 \n\t"
|
||||
"add x12, x11, x4 \n\t"
|
||||
"add x13, x12, x4 \n\t"
|
||||
"add x14, x13, x4 \n\t"
|
||||
"add x15, x14, x4 \n\t"
|
||||
"add x16, x15, x4 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ld1d z1.d, p0/z, [x10] \n\t"
|
||||
"ld1d z2.d, p0/z, [x11] \n\t"
|
||||
"ld1d z3.d, p0/z, [x12] \n\t"
|
||||
"ld1d z4.d, p0/z, [x13] \n\t"
|
||||
"ld1d z5.d, p0/z, [x14] \n\t"
|
||||
"ld1d z6.d, p0/z, [x15] \n\t"
|
||||
"ld1d z7.d, p0/z, [x16] \n\t"
|
||||
"add x5, x16, x4 \n\t"
|
||||
"add x10, x5, x4 \n\t"
|
||||
"add x11, x10, x4 \n\t"
|
||||
"add x12, x11, x4 \n\t"
|
||||
"add x13, x12, x4 \n\t"
|
||||
"add x14, x13, x4 \n\t"
|
||||
"add x15, x14, x4 \n\t"
|
||||
"add x16, x15, x4 \n\t"
|
||||
"ld1d z16.d, p0/z, [x5] \n\t"
|
||||
"ld1d z17.d, p0/z, [x10] \n\t"
|
||||
"ld1d z18.d, p0/z, [x11] \n\t"
|
||||
"ld1d z19.d, p0/z, [x12] \n\t"
|
||||
"ld1d z20.d, p0/z, [x13] \n\t"
|
||||
"ld1d z21.d, p0/z, [x14] \n\t"
|
||||
"ld1d z22.d, p0/z, [x15] \n\t"
|
||||
"ld1d z23.d, p0/z, [x16] \n\t"
|
||||
// Transpose first 8 rows.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
|
||||
// Transpose last 8 rows.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x8(z24,z25,z26,z27,z28,z29,z30,z31,z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3,p8,p4,p6)
|
||||
"add x10, x1, x2 \n\t"
|
||||
"add x11, x10, x2 \n\t"
|
||||
"add x12, x11, x2 \n\t"
|
||||
"add x13, x12, x2 \n\t"
|
||||
"add x14, x13, x2 \n\t"
|
||||
"add x15, x14, x2 \n\t"
|
||||
"add x16, x15, x2 \n\t"
|
||||
"st1d z8.d, p0, [x1] \n\t"
|
||||
"st1d z24.d, p0, [x1, #1, mul vl] \n\t"
|
||||
"st1d z9.d, p0, [x10] \n\t"
|
||||
"st1d z25.d, p0, [x10, #1, mul vl] \n\t"
|
||||
"st1d z10.d, p0, [x11] \n\t"
|
||||
"st1d z26.d, p0, [x11, #1, mul vl] \n\t"
|
||||
"st1d z11.d, p0, [x12] \n\t"
|
||||
"st1d z27.d, p0, [x12, #1, mul vl] \n\t"
|
||||
"st1d z12.d, p0, [x13] \n\t"
|
||||
"st1d z28.d, p0, [x13, #1, mul vl] \n\t"
|
||||
"st1d z13.d, p0, [x14] \n\t"
|
||||
"st1d z29.d, p0, [x14, #1, mul vl] \n\t"
|
||||
"st1d z14.d, p0, [x15] \n\t"
|
||||
"st1d z30.d, p0, [x15, #1, mul vl] \n\t"
|
||||
"st1d z15.d, p0, [x16] \n\t"
|
||||
"st1d z31.d, p0, [x16, #1, mul vl] \n\t"
|
||||
"add x0, x0, #64 \n\t"
|
||||
"add x1, x16, x2 \n\t"
|
||||
"sub x8, x8, #1 \n\t"
|
||||
"b .AROWSTORMKER \n\t"
|
||||
" .AROWSTORMKEREND: \n\t"
|
||||
"mov x4, %[inca] \n\t" // Restore unshifted inca.
|
||||
"index z30.d, xzr, x4 \n\t" // Generate index.
|
||||
"lsl x4, x4, #3 \n\t" // Shift again.
|
||||
"lsl x5, x4, #3 \n\t" // Virtual column vl.
|
||||
" .AROWSTORLEFT: \n\t"
|
||||
"cmp x9, xzr \n\t"
|
||||
"b.eq .UNITKDONE \n\t"
|
||||
"add x6, x0, x5 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
|
||||
"ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
|
||||
"add x1, x1, x2 \n\t"
|
||||
"add x0, x0, #8 \n\t"
|
||||
"sub x9, x9, #1 \n\t"
|
||||
"b .AROWSTORLEFT \n\t"
|
||||
" .UNITKDONE: \n\t"
|
||||
"mov x0, #0 \n\t"
|
||||
:
|
||||
: [a] "r" (a),
|
||||
[p] "r" (p),
|
||||
[lda] "r" (lda),
|
||||
[ldp] "r" (ldp),
|
||||
[inca] "r" (inca),
|
||||
[n_mker] "r" (n_mker),
|
||||
[n_left] "r" (n_left)
|
||||
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
|
||||
"x8", "x9", "x10","x11","x12","x13","x14","x15",
|
||||
"x16","x17","x18",
|
||||
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
|
||||
"z8", "z9", "z10","z11","z12","z13","z14","z15",
|
||||
// "z16","z17","z18","z19","z20","z21","z22","z23",
|
||||
// "z24","z25","z26","z27","z28","z29","z30","z31",
|
||||
"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7"
|
||||
);
|
||||
}
|
||||
else // if ( cdim < mnr )
|
||||
{
|
||||
bli_dscal2m_ex
|
||||
(
|
||||
0,
|
||||
BLIS_NONUNIT_DIAG,
|
||||
BLIS_DENSE,
|
||||
( trans_t )conja,
|
||||
cdim,
|
||||
n,
|
||||
kappa,
|
||||
a, inca, lda,
|
||||
p, 1, ldp,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
// if ( cdim < mnr )
|
||||
{
|
||||
const dim_t i = cdim;
|
||||
const dim_t m_edge = mnr - i;
|
||||
const dim_t n_edge = n_max;
|
||||
double* restrict p_edge = p + (i )*1;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if ( n < n_max )
|
||||
{
|
||||
const dim_t j = n;
|
||||
const dim_t m_edge = mnr;
|
||||
const dim_t n_edge = n_max - j;
|
||||
double* restrict p_edge = p + (j )*ldp;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
191
kernels/armsve/3/armsve_asm_2vx10.h
Normal file
191
kernels/armsve/3/armsve_asm_2vx10.h
Normal file
@@ -0,0 +1,191 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
// One k-step of the 2V x 10 microkernel with B addressed at fixed element
// offsets from BADDR.
//   CnFH/CnLH      accumulator pair for column n (n = 0..9).
//   PT             governing predicate.
//   ACOLFH/ACOLLH  the two A vectors multiplied into every column.
//   BV0..BV7       B registers; each is consumed by one GEMM_FMLA2_LD1R and
//                  reloaded right after. BV0/BV1 are used twice: for columns
//                  0/1 (then reloaded from offsets 8/9) and again for
//                  columns 8/9 (then reloaded from offsets 6/7 of the
//                  advanced address).
//   BADDR, BRSBIT  B address and the increment added to it after the first
//                  two columns.
// On exit the B registers are rotated by two relative to entry, which is why
// the _2/_3/_4 variants pass a rotated BV list.
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BADDR,8) \
GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BADDR,9) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
GEMM_FMLA2_LD1R(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BADDR,0) \
GEMM_FMLA2_LD1R(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BADDR,1) \
GEMM_FMLA2_LD1R(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BADDR,2) \
GEMM_FMLA2_LD1R(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BADDR,3) \
GEMM_FMLA2_LD1R(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BADDR,4) \
GEMM_FMLA2_LD1R(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BADDR,5) \
\
GEMM_FMLA2_LD1R(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BADDR,6) \
GEMM_FMLA2_LD1R(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BADDR,7)
|
||||
|
||||
// Second through fourth microkernels are the first one with the B vectors rotated.
|
||||
// k-steps 2, 3 and 4: identical to GEMM_2VX10_MKER_LOOP_PLAIN_C_1 except that
// the BV list is rotated left by 2, 4 and 6 positions respectively, matching
// the rotation each preceding step leaves behind.
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BRSBIT)

#define GEMM_2VX10_MKER_LOOP_PLAIN_C_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BRSBIT)

#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BRSBIT)
|
||||
// NOTE:
|
||||
// The microkernel (PLAIN_1-4 as a whole) satisfies on entry/exit
|
||||
// (sth. akin to loop-invariant):
|
||||
// - BV[0-7] holds B[0:7, 4*k_cur]
|
||||
// - B's address stops at B[0, 4*k_cur+1]
|
||||
|
||||
// Final loop inside K=4 microkernels.
|
||||
// Final k-step of a K=4 group. It uses the same BV rotation as the _4 variant
// (columns 0/1 consume BV6/BV7), but only BV6/BV7 are reloaded (offsets 8/9,
// needed for columns 8/9). Columns 2..9 use plain GEMM_FMLA2, so no B values
// beyond the current k are fetched.
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BADDR,8) \
GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BADDR,9) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
|
||||
|
||||
// K=4 MKer loop with B memory scattered.
|
||||
// One k-step of the 2V x 10 microkernel for B stored with a non-unit element
// stride. Same column/BV schedule as GEMM_2VX10_MKER_LOOP_PLAIN_C_1, but each
// B element is loaded from BELMADDR, which GEMM_FMLA2_LD1R_G_ELMFWD then
// advances by BCSBIT. After the first two columns BADDR is advanced by BRSBIT
// and BELMADDR is reset to the new BADDR.
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
" mov "#BELMADDR", "#BADDR" \n\t" \
GEMM_FMLA2_LD1R_G_ELMFWD(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
\
GEMM_FMLA2_LD1R_G_ELMFWD(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT)
|
||||
|
||||
// k-steps 2, 3 and 4 of the strided-B microkernel: GEMM_2VX10_MKER_LOOP_PLAIN_G_1
// with the BV list rotated left by 2, 4 and 6 positions respectively.
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BELMADDR,BRSBIT,BCSBIT)

#define GEMM_2VX10_MKER_LOOP_PLAIN_G_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BELMADDR,BRSBIT,BCSBIT)

#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BELMADDR,BRSBIT,BCSBIT)
|
||||
|
||||
// Final k-step of a K=4 group for strided B. Columns 0/1 consume BV6/BV7 and
// reload them through BELMADDR (for use by columns 8/9); BADDR is advanced and
// BELMADDR reset to it; columns 2..9 use plain GEMM_FMLA2 with no further B
// loads.
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
" mov "#BELMADDR", "#BADDR" \n\t" \
GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
|
||||
|
||||
|
||||
// Zeroes twenty vector registers by applying CLEAR_COL4 to five groups of four.
#define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \
CLEAR_COL4(Z00,Z01,Z02,Z03) \
CLEAR_COL4(Z04,Z05,Z06,Z07) \
CLEAR_COL4(Z08,Z09,Z10,Z11) \
CLEAR_COL4(Z12,Z13,Z14,Z15) \
CLEAR_COL4(Z16,Z17,Z18,Z19)
|
||||
|
||||
// Multiplies twenty vector registers in place by ZFACTOR by applying
// SCALE_COL4 to five groups of four.
#define SCALE_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19,ZFACTOR) \
SCALE_COL4(Z00,Z01,Z02,Z03,ZFACTOR) \
SCALE_COL4(Z04,Z05,Z06,Z07,ZFACTOR) \
SCALE_COL4(Z08,Z09,Z10,Z11,ZFACTOR) \
SCALE_COL4(Z12,Z13,Z14,Z15,ZFACTOR) \
SCALE_COL4(Z16,Z17,Z18,Z19,ZFACTOR)
|
||||
|
||||
// Applies GEMM_CCOL_FMAD (defined elsewhere) to five column pairs: for each
// n in 0..4 it is invoked with ZnFH/ZnLH, the predicates PFH/PLH, CnFH/CnLH
// and the shared ZSCALE register.
// NOTE(review): presumably combines accumulators with scaled C columns;
// confirm against the definition of GEMM_CCOL_FMAD.
#define GEMM_C_FMAD_UKER(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE) \
GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE)
|
||||
|
||||
// Invokes GEMM_CCOL_CONTIGUOUS_LOAD_FWD (defined elsewhere) once per column
// pair ZnFH/ZnLH, n = 0..4, all sharing PFH/PLH, CADDR and CCS.
// NOTE(review): the _FWD suffix suggests CADDR is advanced by CCS after each
// column; confirm against the helper's definition.
#define GEMM_C_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
|
||||
|
||||
// Invokes GEMM_CCOL_CONTIGUOUS_STORE_FWD (defined elsewhere) once per column
// pair ZnFH/ZnLH, n = 0..4, all sharing PFH/PLH, CADDR and CCS.
// NOTE(review): the _FWD suffix suggests CADDR is advanced by CCS after each
// column; confirm against the helper's definition.
#define GEMM_C_STORE_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
|
||||
|
||||
// Interleaved form of GEMM_C_FMAD_UKER and a contiguous load: for each
// n in 0..4, GEMM_CCOL_FMAD is applied to (ZnFH/ZnLH, CnFH/CnLH, ZSCALE) and
// immediately afterwards CnFH/CnLH are reloaded through
// GEMM_CCOL_CONTIGUOUS_LOAD_FWD, so the C registers are free to receive the
// next columns as soon as they have been consumed.
#define GEMM_C_FMAD_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,CADDR,CCS) \
GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C0FH,C0LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C1FH,C1LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C2FH,C2LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C3FH,C3LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C4FH,C4LH,PFH,PLH,CADDR,CCS)
|
||||
|
||||
// Gather counterpart of GEMM_C_LOAD_UKER_C: invokes GEMM_CCOL_GATHER_LOAD_FWD
// (defined elsewhere) once per column pair ZnFH/ZnLH, n = 0..4, sharing the
// index vector ZIDX, predicates PFH/PLH and the scalar registers
// CADDR/CCS/CVSKIP/CTEMP.
#define GEMM_C_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_GATHER_LOAD_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_GATHER_LOAD_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_GATHER_LOAD_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_GATHER_LOAD_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_GATHER_LOAD_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
|
||||
|
||||
// Scatter counterpart of GEMM_C_STORE_UKER_C: invokes
// GEMM_CCOL_SCATTER_STORE_FWD (defined elsewhere) once per column pair
// ZnFH/ZnLH, n = 0..4, sharing ZIDX, PFH/PLH and CADDR/CCS/CVSKIP/CTEMP.
#define GEMM_C_STORE_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_SCATTER_STORE_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_SCATTER_STORE_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_SCATTER_STORE_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_SCATTER_STORE_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_SCATTER_STORE_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
|
||||
|
||||
// Gather counterpart of GEMM_C_FMAD_LOAD_UKER_C: for each n in 0..4,
// GEMM_CCOL_FMAD is applied to (ZnFH/ZnLH, CnFH/CnLH, ZSCALE) and CnFH/CnLH
// are then reloaded through GEMM_CCOL_GATHER_LOAD_FWD.
#define GEMM_C_FMAD_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,ZIDX,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
GEMM_CCOL_GATHER_LOAD_FWD(C0FH,C0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
GEMM_CCOL_GATHER_LOAD_FWD(C1FH,C1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
GEMM_CCOL_GATHER_LOAD_FWD(C2FH,C2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
GEMM_CCOL_GATHER_LOAD_FWD(C3FH,C3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
GEMM_CCOL_GATHER_LOAD_FWD(C4FH,C4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
|
||||
|
||||
123
kernels/armsve/3/armsve_asm_macros.h
Normal file
123
kernels/armsve/3/armsve_asm_macros.h
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
// Sets every element of the two vector registers Z0 and Z1 to zero.
// DT (element-type suffix) must be defined as a string by the including file.
#define CLEAR_COL2(Z0,Z1) \
" dup "#Z0"."DT", #0 \n\t" \
" dup "#Z1"."DT", #0 \n\t"
|
||||
|
||||
// Zero four SVE vector registers by expanding CLEAR_COL2 on (Z0,Z1) and
// then on (Z2,Z3).
#define CLEAR_COL4(Z0,Z1,Z2,Z3) \
|
||||
CLEAR_COL2(Z0,Z1) \
|
||||
CLEAR_COL2(Z2,Z3)
|
||||
|
||||
// Scale two SVE vector registers in place by ZFACTOR: emits
//   fmul Z0.<DT>, Z0.<DT>, ZFACTOR.<DT>
//   fmul Z1.<DT>, Z1.<DT>, ZFACTOR.<DT>
// Z0, Z1 and ZFACTOR are register names (stringified with #); DT is the
// element-size suffix string that the including file is expected to define.
// The last line deliberately carries no trailing backslash, so the macro
// ends here regardless of what follows it in the file.
#define SCALE_COL2(Z0,Z1,ZFACTOR) \
" fmul "#Z0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \
" fmul "#Z1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t"
|
||||
|
||||
// Scale four SVE vector registers in place by ZFACTOR by expanding
// SCALE_COL2 on (Z0,Z1) and then on (Z2,Z3).
#define SCALE_COL4(Z0,Z1,Z2,Z3,ZFACTOR) \
|
||||
SCALE_COL2(Z0,Z1,ZFACTOR) \
|
||||
SCALE_COL2(Z2,Z3,ZFACTOR)
|
||||
|
||||
// Optional contiguous prefetch, selected by suffix. Other macros in this
// file pick a variant by token-pasting a mode name onto PREFETCH_CONTIGUOUS_:
//   noprfm - expands to nothing (no instruction emitted);
//   prfm   - emits one `prfm PLD<LV><PROP>, [ADDR, SHIFT]`, where LV and PROP
//            are pasted together to form the prefetch operation name
//            (e.g. L1 + STRM -> PLDL1STRM).
// Both variants take the same argument list so they are interchangeable.
|
||||
#define PREFETCH_CONTIGUOUS_noprfm(LV,PROP,ADDR,SHIFT)
|
||||
#define PREFETCH_CONTIGUOUS_prfm(LV,PROP,ADDR,SHIFT) \
|
||||
" prfm PLD"#LV""#PROP", ["#ADDR", "#SHIFT"] \n\t"
|
||||
|
||||
// Two predicated fused multiply-adds that update one two-vector column of
// accumulators with the same multiplier vector BV:
//   CCOLFH += ACOLFH * BV   (first vector of the column)
//   CCOLLH += ACOLLH * BV   (second vector of the column)
// PT is the governing predicate, used in merging form (/m) for both.
#define GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
|
||||
" fmla "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" /* A Row 0 :VL */ \
|
||||
" fmla "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t" /* A Row VL:2VL */
|
||||
|
||||
// GEMM_FMLA2 followed by a reload of BV: after the two multiply-adds have
// consumed BV, the LD1R instruction (mnemonic string supplied by the
// includer) loads a new value into BV from [BADDR, #NSHIFT*SZ]. The offset
// is written as an assembler expression, NSHIFT times the SZ string, so
// NSHIFT counts elements rather than bytes. Loading uses zeroing
// predication (PT/z).
#define GEMM_FMLA2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \
|
||||
GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
|
||||
" "LD1R" "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t"
|
||||
|
||||
// Variant of GEMM_FMLA2_LD1R that walks B through a running element pointer:
// after the two multiply-adds, BV is reloaded with LD1R from [BELMADDR] (no
// offset) and BELMADDR is then advanced by BCSBIT.
// NOTE(review): BCSBIT is added to the address unscaled, so it is presumably
// a stride already expressed in bytes -- confirm at the call sites.
#define GEMM_FMLA2_LD1R_G_ELMFWD(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BELMADDR,BCSBIT) \
|
||||
GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
|
||||
" "LD1R" "#BV"."DT", "#PT"/z, ["#BELMADDR"] \n\t" /* Load B */ \
|
||||
" add "#BELMADDR", "#BELMADDR", "#BCSBIT" \n\t" /* Forward B element */
|
||||
|
||||
// Load two consecutive vectors from contiguous memory: ZFH from [AADDR]
// under predicate PFH and ZLH from [AADDR, #1, mul vl] (one vector length
// further on) under predicate PLH. Both loads use zeroing predication.
// AADDR itself is not modified.
#define GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
|
||||
" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR"] \n\t" \
|
||||
" "LD1" "#ZLH"."DT", "#PLH"/z, ["#AADDR"", #1, mul vl]\n\t"
|
||||
|
||||
// Gather-load two vectors using the per-lane index vector ZIDX (with the
// OFFS addressing modifier supplied by the includer):
//   ZFH <- gather from base AADDR,           predicate PFH/z
//   ZLH <- gather from base AADDR + AVSKIP,  predicate PLH/z
// The second base address is formed in ATEMP, which is overwritten; AADDR is
// left unchanged.
#define GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
|
||||
" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
|
||||
" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
|
||||
" "LD1" "#ZLH"."DT", "#PLH"/z, ["#ATEMP", "#ZIDX"."DT", "OFFS"]\n\t"
|
||||
|
||||
// Optional gather prefetch, selected by suffix (the variant is chosen by
// token-pasting a mode name onto GEMM_ACOL_GATHER_):
//   noprfm - expands to nothing;
//   prfm   - issues two PRFG gather prefetches with operation
//            PLD<LV><PROP>: the first from base AADDR under predicate PFH,
//            the second from base AADDR + AVSKIP under predicate PLH. The
//            second base is formed in ATEMP, which is overwritten.
// Both variants share one argument list so they are interchangeable.
|
||||
#define GEMM_ACOL_GATHER_noprfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
|
||||
#define GEMM_ACOL_GATHER_prfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
|
||||
" "PRFG" PLD"#LV""#PROP", "#PFH", ["#AADDR", "#ZIDX"."DT", "OFFS"] \n\t" \
|
||||
" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
|
||||
" "PRFG" PLD"#LV""#PROP", "#PLH", ["#ATEMP", "#ZIDX"."DT", "OFFS"] \n\t"
|
||||
|
||||
// Contiguous-A step: advance to the next column, load it, and optionally
// prefetch ahead. In order:
//   1. ATEMP = AADDR + A4KS   (computed from AADDR *before* it is advanced);
//   2. AADDR += ACS;
//   3. load the two-vector column at the new AADDR into ZFH/ZLH;
//   4. PREFETCH_CONTIGUOUS_<PREFMODE>(L1,STRM,ATEMP,0): with PREFMODE = prfm
//      this prefetches [ATEMP, 0] as PLDL1STRM, with noprfm it emits nothing.
// NOTE(review): A4KS is presumably a look-ahead distance in bytes -- confirm
// at the call sites.
#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(ZFH,ZLH,PFH,PLH,AADDR,A4KS,ACS,ATEMP,PREFMODE) \
|
||||
" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
|
||||
" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
|
||||
PREFETCH_CONTIGUOUS_ ##PREFMODE(L1,STRM,ATEMP,0)
|
||||
|
||||
// Gather-A step: optionally prefetch at two look-ahead distances, then
// advance to the next column and gather-load it. In order:
//   1. ATEMP = AADDR + A4KS, then GEMM_ACOL_GATHER_<PREFMODEL1> with
//      operation PLDL1STRM based at ATEMP;
//   2. ATEMP = AADDR + APS, then GEMM_ACOL_GATHER_<PREFMODEL2> with
//      operation PLDL2STRM based at ATEMP;
//   3. AADDR += ACS;
//   4. gather-load the two-vector column at the new AADDR into ZFH/ZLH.
// ATEMP is passed as both base and scratch to the prefetch macros, so it is
// overwritten throughout. Each PREFMODEL* is either prfm or noprfm.
// NOTE(review): A4KS and APS are presumably byte distances -- confirm at the
// call sites.
#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,A4KS,APS,ACS,AVSKIP,ATEMP,PREFMODEL1,PREFMODEL2) \
|
||||
" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
|
||||
GEMM_ACOL_GATHER_ ##PREFMODEL1(L1,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
|
||||
" add "#ATEMP", "#AADDR", "#APS" \n\t" \
|
||||
GEMM_ACOL_GATHER_ ##PREFMODEL2(L2,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
|
||||
" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
|
||||
GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
|
||||
|
||||
// Load one contiguous two-vector column of C into ZFH/ZLH (reusing the
// contiguous A-column load macro), then advance CADDR by CCS so it
// addresses the next column.
#define GEMM_CCOL_CONTIGUOUS_LOAD_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,CADDR) \
|
||||
" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (load) to next column. */
|
||||
|
||||
// Store one contiguous two-vector column of C: ZFH to [CADDR] under
// predicate PFH and ZLH to [CADDR, #1, mul vl] under predicate PLH, then
// advance CADDR by CCS so it addresses the next column.
#define GEMM_CCOL_CONTIGUOUS_STORE_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
|
||||
" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR"] \n\t" \
|
||||
" "ST1" "#ZLH"."DT", "#PLH", ["#CADDR", #1, mul vl] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (store) to next column. */
|
||||
|
||||
// Combine a column just read from memory (ZFH/ZLH) with a second column
// (CFH/CLH), in place. `fmad Zdn, Pg/m, Zm, Za` computes
// Zdn = Za + Zdn * Zm, so this emits
//   ZFH = CFH + ZFH * ZSCALE
//   ZLH = CLH + ZLH * ZSCALE
// and the result ends up in ZFH/ZLH. PFH/PLH are merging predicates.
#define GEMM_CCOL_FMAD(ZFH,ZLH,PFH,PLH,CFH,CLH,ZSCALE) \
|
||||
" fmad "#ZFH"."DT", "#PFH"/m, "#ZSCALE"."DT", "#CFH"."DT" \n\t" \
|
||||
" fmad "#ZLH"."DT", "#PLH"/m, "#ZSCALE"."DT", "#CLH"."DT" \n\t"
|
||||
|
||||
// Gather-load one two-vector column of C through index vector ZIDX
// (reusing the A-column gather macro, with CVSKIP as the offset of the
// second vector and CTEMP as scratch), then advance CADDR by CCS so it
// addresses the next column.
#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CVSKIP,CTEMP) \
|
||||
" add "#CADDR", "#CADDR", "#CCS" \n\t"
|
||||
|
||||
// Scatter-store one two-vector column of C through index vector ZIDX (with
// the OFFS addressing modifier supplied by the includer):
//   ZFH -> scatter to base CADDR,           predicate PFH
//   ZLH -> scatter to base CADDR + CVSKIP,  predicate PLH
// The second base is formed in CTEMP, which is overwritten. Finally CADDR
// is advanced by CCS so it addresses the next column.
#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
|
||||
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
|
||||
" "ST1" "#ZLH"."DT", "#PLH", ["#CTEMP", "#ZIDX"."DT", "OFFS"]\n\t" \
|
||||
" add "#CADDR", "#CADDR", "#CCS" \n\t"
|
||||
|
||||
|
||||
46
kernels/armsve/3/armsve_asm_macros_double.h
Normal file
46
kernels/armsve/3/armsve_asm_macros_double.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
// Specify to use double precision.
// These string macros parameterize the shared assembly macros included at
// the bottom of this file:
//   DT   - element-size suffix appended to vector register names (z0.d);
//   LD1  - vector load mnemonic (used for contiguous and gather loads);
//   ST1  - vector store mnemonic (used for contiguous and scatter stores);
//   LD1R - load-one-element-and-replicate mnemonic;
//   PRFG - prefetch mnemonic used by the gather-prefetch macro;
//   SZ   - element size in bytes, as text, used inside offset expressions;
//   OFFS - index modifier for gather/scatter addressing; "lsl #3" scales
//          each index by 8 bytes.
// They must be defined before the shared macros are expanded.
|
||||
#define DT "d"
|
||||
#define LD1 "ld1d"
|
||||
#define ST1 "st1d"
|
||||
#define LD1R "ld1rd"
|
||||
#define PRFG "prfd"
|
||||
#define SZ "8"
|
||||
#define OFFS "lsl #3"
|
||||
// Include macros.
|
||||
#include "armsve_asm_macros.h"
|
||||
|
||||
46
kernels/armsve/3/armsve_asm_macros_half.h
Normal file
46
kernels/armsve/3/armsve_asm_macros_half.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
// Specify to use half precision.
// These string macros parameterize the shared assembly macros included at
// the bottom of this file:
//   DT   - element-size suffix appended to vector register names (z0.h);
//   LD1  - vector load mnemonic;
//   ST1  - vector store mnemonic;
//   LD1R - load-one-element-and-replicate mnemonic;
//   PRFG - prefetch mnemonic used by the gather-prefetch macro;
//   SZ   - element size in bytes, as text, used inside offset expressions.
// They must be defined before the shared macros are expanded.
|
||||
#define DT "h"
|
||||
#define LD1 "ld1h"
|
||||
#define ST1 "st1h"
|
||||
#define LD1R "ld1rh"
|
||||
#define PRFG "prfh"
|
||||
#define SZ "2"
|
||||
// OFFS (the gather/scatter index modifier) is not defined for half
// precision; it is marked unsupported below. Shared macros that reference
// OFFS (the gather/scatter loads, stores and prefetches) therefore cannot
// be expanded after including this header.
// #define OFFS UNSUPPORTED
|
||||
// Include macros.
|
||||
#include "armsve_asm_macros.h"
|
||||
|
||||
46
kernels/armsve/3/armsve_asm_macros_single.h
Normal file
46
kernels/armsve/3/armsve_asm_macros_single.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
// Specify to use single precision.
// These string macros parameterize the shared assembly macros included at
// the bottom of this file:
//   DT   - element-size suffix appended to vector register names (z0.s);
//   LD1  - vector load mnemonic (used for contiguous and gather loads);
//   ST1  - vector store mnemonic (used for contiguous and scatter stores);
//   LD1R - load-one-element-and-replicate mnemonic;
//   PRFG - prefetch mnemonic used by the gather-prefetch macro;
//   SZ   - element size in bytes, as text, used inside offset expressions;
//   OFFS - index modifier for gather/scatter addressing; "uxtw #2"
//          zero-extends each 32-bit index and scales it by 4 bytes.
// They must be defined before the shared macros are expanded.
|
||||
#define DT "s"
|
||||
#define LD1 "ld1w"
|
||||
#define ST1 "st1w"
|
||||
#define LD1R "ld1rw"
|
||||
#define PRFG "prfw"
|
||||
#define SZ "4"
|
||||
#define OFFS "uxtw #2"
|
||||
// Include macros.
|
||||
#include "armsve_asm_macros.h"
|
||||
|
||||
318
kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
Normal file
318
kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
Normal file
@@ -0,0 +1,318 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Forschungszentrum Juelich
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
// Double-precision composite instructions.
|
||||
#include "armsve_asm_macros_double.h"
|
||||
|
||||
// 2vx10 microkernels.
|
||||
#include "armsve_asm_2vx10.h"
|
||||
|
||||
void bli_dgemm_armsve_asm_2vx10_unindexed
|
||||
(
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile (
|
||||
" ldr x0, %[a] \n\t"
|
||||
" ldr x1, %[b] \n\t"
|
||||
" mov x2, xzr \n\t"
|
||||
" incd x2, ALL, MUL #2 \n\t" // Column-skip of A.
|
||||
" mov x3, #10 \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
" ldr x5, %[c] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
#ifdef _A64FX
|
||||
" mov x8, 0x3 \n\t" // Tag C address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x5, x5, x8 \n\t"
|
||||
" mov x8, 0x2 \n\t" // Tag B address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x1, x1, x8 \n\t"
|
||||
" mov x8, 0x1 \n\t" // Tag A address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x0, x0, x8 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
|
||||
" madd x2, x8, x2, xzr \n\t" // cs_a
|
||||
" madd x3, x8, x3, xzr \n\t" // rs_b
|
||||
" madd x7, x8, x7, xzr \n\t" // cs_c
|
||||
" ptrue p0.d \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[k_mker] \n\t" // Number of loops.
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
" LOAD_ABC: \n\t"
|
||||
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
|
||||
" b.eq END_CCOL_PRFM \n\t"
|
||||
|
||||
" ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rd z21.d, p0/z, [x1, 8] \n\t"
|
||||
" ld1rd z22.d, p0/z, [x1, 16] \n\t"
|
||||
" ld1rd z23.d, p0/z, [x1, 24] \n\t"
|
||||
" ld1rd z24.d, p0/z, [x1, 32] \n\t"
|
||||
" ld1rd z25.d, p0/z, [x1, 40] \n\t"
|
||||
" ld1rd z26.d, p0/z, [x1, 48] \n\t"
|
||||
" ld1rd z27.d, p0/z, [x1, 56] \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
" \n\t"
|
||||
" CCOL_PRFM: \n\t"
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
|
||||
" mov x16, x5 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" END_CCOL_PRFM: \n\t"
|
||||
" \n\t"
|
||||
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
|
||||
" \n\t"
|
||||
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
|
||||
" b.eq K_LEFT_LOOP \n\t"
|
||||
" \n\t"
|
||||
" K_MKER_LOOP: \n\t"
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
|
||||
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" b K_MKER_LOOP \n\t"
|
||||
" \n\t"
|
||||
" FIN_MKER_LOOP: \n\t"
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
|
||||
" \n\t"
|
||||
" K_LEFT_LOOP: \n\t"
|
||||
" cmp x8, #0 \n\t" // End of execution.
|
||||
" b.eq WRITE_MEM_PREP \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
" ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rd z21.d, p0/z, [x1, 8] \n\t"
|
||||
" ld1rd z22.d, p0/z, [x1, 16] \n\t"
|
||||
" ld1rd z23.d, p0/z, [x1, 24] \n\t"
|
||||
" ld1rd z24.d, p0/z, [x1, 32] \n\t"
|
||||
" ld1rd z25.d, p0/z, [x1, 40] \n\t"
|
||||
" ld1rd z26.d, p0/z, [x1, 48] \n\t"
|
||||
" ld1rd z27.d, p0/z, [x1, 56] \n\t"
|
||||
" ld1rd z28.d, p0/z, [x1, 64] \n\t"
|
||||
" ld1rd z29.d, p0/z, [x1, 72] \n\t"
|
||||
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
|
||||
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
|
||||
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
|
||||
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
|
||||
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
|
||||
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
|
||||
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
|
||||
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
|
||||
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
|
||||
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
|
||||
" add x0, x0, x2 \n\t" // Forward A.
|
||||
" add x1, x1, x3 \n\t" // Forward B.
|
||||
" sub x8, x8, #1 \n\t"
|
||||
" b K_LEFT_LOOP \n\t" // Next column / row.
|
||||
" \n\t"
|
||||
" WRITE_MEM_PREP: \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ldr x4, [x4] \n\t" // Load alpha & beta (value).
|
||||
" ldr x8, [x8] \n\t"
|
||||
" dup z30.d, x4 \n\t" // Broadcast alpha & beta into vectors.
|
||||
" dup z31.d, x8 \n\t"
|
||||
" fmov d28, #1.0 \n\t" // Prepare FP 1.0.
|
||||
" fmov x16, d28 \n\t"
|
||||
" \n\t"
|
||||
" PREFETCH_ABNEXT: \n\t"
|
||||
" ldr x0, %[a_next] \n\t"
|
||||
" ldr x1, %[b_next] \n\t"
|
||||
#ifdef _A64FX
|
||||
" mov x8, 0x2 \n\t" // Tag B address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x1, x1, x8 \n\t"
|
||||
" mov x8, 0x1 \n\t" // Tag A address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x0, x0, x8 \n\t"
|
||||
#endif
|
||||
" prfm PLDL1STRM, [x0] \n\t"
|
||||
" prfm PLDL1STRM, [x0, 256*1] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*2] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*3] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*4] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*5] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*6] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*7] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*8] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*9] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*10] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*11] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*12] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*13] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*14] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*15] \n\t"
|
||||
" prfm PLDL1STRM, [x1] \n\t"
|
||||
" prfm PLDL1STRM, [x1, 256*1] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*2] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*3] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*4] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*5] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*6] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*7] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*8] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*9] \n\t"
|
||||
" \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x6, #1 \n\t" // Preload first half of C for contiguous case.
|
||||
" b.ne WRITE_MEM \n\t"
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
|
||||
" \n\t"
|
||||
" WRITE_MEM: \n\t"
|
||||
" \n\t"
|
||||
" cmp x16, x4 \n\t"
|
||||
" b.eq UNIT_ALPHA \n\t"
|
||||
" \n\t"
|
||||
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
|
||||
" \n\t"
|
||||
" UNIT_ALPHA: \n\t"
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne WRITE_MEM_G \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
// First half of C is already loaded in this case.
|
||||
GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
|
||||
" mov x8, xzr \n\t"
|
||||
" incb x8 \n\t"
|
||||
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" b END_EXEC \n\t"
|
||||
" \n\t"
|
||||
" END_ERROR: \n\t"
|
||||
" mov x0, #1 \n\t" // Return error.
|
||||
" END_EXEC: \n\t"
|
||||
" mov x0, #0 \n\t" // Return normal.
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next)
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
|
||||
"x9","x16",
|
||||
"z0","z1","z2","z3","z4","z5","z6","z7",
|
||||
"z8","z9","z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19",
|
||||
"z20","z21","z22","z23",
|
||||
"z24","z25","z26","z27",
|
||||
"z28","z29","z30","z31"
|
||||
);
|
||||
}
|
||||
|
||||
307
kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
Normal file
307
kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
Normal file
@@ -0,0 +1,307 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
Copyright (C) 2019, Forschungszentrum Juelich
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
// Single-precision composite instructions.
|
||||
#include "armsve_asm_macros_single.h"
|
||||
|
||||
// 2vx10 microkernels.
|
||||
#include "armsve_asm_2vx10.h"
|
||||
|
||||
void bli_sgemm_armsve_asm_2vx10_unindexed
|
||||
(
|
||||
dim_t k0,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile (
|
||||
" ldr x0, %[a] \n\t"
|
||||
" ldr x1, %[b] \n\t"
|
||||
" mov x2, xzr \n\t"
|
||||
" incw x2, ALL, MUL #2 \n\t" // Column-skip of A.
|
||||
" mov x3, #10 \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
" ldr x5, %[c] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
#ifdef _A64FX
|
||||
" mov x8, 0x3 \n\t" // Tag C address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x5, x5, x8 \n\t"
|
||||
" mov x8, 0x2 \n\t" // Tag B address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x1, x1, x8 \n\t"
|
||||
" mov x8, 0x1 \n\t" // Tag A address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x0, x0, x8 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x8, #4 \n\t" // Multiply some address skips by sizeof(float).
|
||||
" madd x2, x8, x2, xzr \n\t" // cs_a
|
||||
" madd x3, x8, x3, xzr \n\t" // rs_b
|
||||
" madd x7, x8, x7, xzr \n\t" // cs_c
|
||||
" ptrue p0.s \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[k_mker] \n\t" // Number of loops.
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
" LOAD_ABC: \n\t"
|
||||
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
|
||||
" b.eq END_CCOL_PRFM \n\t"
|
||||
|
||||
" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rw z21.s, p0/z, [x1, 4] \n\t"
|
||||
" ld1rw z22.s, p0/z, [x1, 8] \n\t"
|
||||
" ld1rw z23.s, p0/z, [x1, 12] \n\t"
|
||||
" ld1rw z24.s, p0/z, [x1, 16] \n\t"
|
||||
" ld1rw z25.s, p0/z, [x1, 20] \n\t"
|
||||
" ld1rw z26.s, p0/z, [x1, 24] \n\t"
|
||||
" ld1rw z27.s, p0/z, [x1, 28] \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
" \n\t"
|
||||
" CCOL_PRFM: \n\t"
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
|
||||
" mov x16, x5 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" END_CCOL_PRFM: \n\t"
|
||||
" \n\t"
|
||||
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
|
||||
" \n\t"
|
||||
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
|
||||
" b.eq K_LEFT_LOOP \n\t"
|
||||
" \n\t"
|
||||
" K_MKER_LOOP: \n\t"
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
|
||||
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" b K_MKER_LOOP \n\t"
|
||||
" \n\t"
|
||||
" FIN_MKER_LOOP: \n\t"
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
|
||||
" \n\t"
|
||||
" K_LEFT_LOOP: \n\t"
|
||||
" cmp x8, #0 \n\t" // End of execution.
|
||||
" b.eq WRITE_MEM_PREP \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rw z21.s, p0/z, [x1, 4] \n\t"
|
||||
" ld1rw z22.s, p0/z, [x1, 8] \n\t"
|
||||
" ld1rw z23.s, p0/z, [x1, 12] \n\t"
|
||||
" ld1rw z24.s, p0/z, [x1, 16] \n\t"
|
||||
" ld1rw z25.s, p0/z, [x1, 20] \n\t"
|
||||
" ld1rw z26.s, p0/z, [x1, 24] \n\t"
|
||||
" ld1rw z27.s, p0/z, [x1, 28] \n\t"
|
||||
" ld1rw z28.s, p0/z, [x1, 32] \n\t"
|
||||
" ld1rw z29.s, p0/z, [x1, 36] \n\t"
|
||||
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
|
||||
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
|
||||
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
|
||||
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
|
||||
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
|
||||
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
|
||||
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
|
||||
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
|
||||
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
|
||||
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
|
||||
" add x0, x0, x2 \n\t" // Forward A.
|
||||
" add x1, x1, x3 \n\t" // Forward B.
|
||||
" sub x8, x8, #1 \n\t"
|
||||
" b K_LEFT_LOOP \n\t" // Next column / row.
|
||||
" \n\t"
|
||||
" WRITE_MEM_PREP: \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ldr w4, [x4] \n\t" // Load alpha & beta (value).
|
||||
" ldr w8, [x8] \n\t"
|
||||
" dup z30.s, w4 \n\t" // Broadcast alpha & beta into vectors.
|
||||
" dup z31.s, w8 \n\t"
|
||||
" \n\t"
|
||||
" PREFETCH_ABNEXT: \n\t"
|
||||
" ldr x0, %[a_next] \n\t"
|
||||
" ldr x1, %[b_next] \n\t"
|
||||
" prfm PLDL2KEEP, [x0] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*9] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*10] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*11] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*12] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*13] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*14] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*15] \n\t"
|
||||
" prfm PLDL2KEEP, [x1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM: \n\t"
|
||||
" \n\t"
|
||||
" fmov s28, #1.0 \n\t"
|
||||
" fmov w16, s28 \n\t"
|
||||
" cmp w16, w4 \n\t"
|
||||
" b.eq UNIT_ALPHA \n\t"
|
||||
" \n\t"
|
||||
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
|
||||
" \n\t"
|
||||
" UNIT_ALPHA: \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne WRITE_MEM_G \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
|
||||
" mov x8, xzr \n\t"
|
||||
" incb x8 \n\t"
|
||||
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" b END_EXEC \n\t"
|
||||
" \n\t"
|
||||
" END_ERROR: \n\t"
|
||||
" mov x0, #1 \n\t" // Return error.
|
||||
" END_EXEC: \n\t"
|
||||
" mov x0, #0 \n\t" // Return normal.
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next)
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
|
||||
"x9","x16",
|
||||
"z0","z1","z2","z3","z4","z5","z6","z7",
|
||||
"z8","z9","z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19",
|
||||
"z20","z21","z22","z23",
|
||||
"z24","z25","z26","z27",
|
||||
"z28","z29","z30","z31"
|
||||
);
|
||||
}
|
||||
|
||||
343
kernels/armsve/3/bli_gemm_armsve_asm_sh2vx10_unindexed.c
Normal file
343
kernels/armsve/3/bli_gemm_armsve_asm_sh2vx10_unindexed.c
Normal file
@@ -0,0 +1,343 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
Copyright (C) 2019, Forschunszentrum Juelich
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
// Half-precision composite instructions.
|
||||
#include "armsve_asm_macros_half.h"
|
||||
|
||||
// 2vx10 microkernels.
|
||||
#include "armsve_asm_2vx10.h"
|
||||
|
||||
// Gather-load / scatter-store instructions for half-precision
|
||||
// need to be defined separately.
|
||||
#undef GEMM_CCOL_GATHER_LOAD_FWD
|
||||
#undef GEMM_CCOL_SCATTER_STORE_FWD
|
||||
|
||||
/*
   Half-precision gather-load of one C column into two vector registers,
   then advance the column pointer.

   For each of the two output registers (ZLO first, then ZHI):
     - one ld1h gather into 32-bit containers from the base address,
       result in z31;
     - a second ld1h gather from base + CROWOFF, result in the output
       register, whose halfwords are then swapped inside every 32-bit
       container (revh);
     - the two are combined with a halfword-wise fadd.
   The base for ZLO is CPTR; the base for ZHI is CPTR + CVECSKIP (kept in
   CSCRATCH). Finally CPTR is advanced by CCOLSKIP.

   ZINDEX supplies unsigned 32-bit element indices, scaled by 2 bytes
   (uxtw #1). x28 and z31 are used as hard-coded scratch registers, so the
   enclosing asm statement must list them as clobbered and must not keep
   live data in them across this macro.
*/
#define GEMM_CCOL_GATHER_LOAD_FWD(ZLO,ZHI,ZINDEX,PRED,CROWOFF,CPTR,CCOLSKIP,CVECSKIP,CSCRATCH) \
  /* First vector: gathers based at CPTR and CPTR + CROWOFF. */ \
  " add x28, "#CPTR", "#CROWOFF" \n\t" \
  " ld1h z31.s, "#PRED"/z, ["#CPTR", "#ZINDEX".s, uxtw #1] \n\t" \
  " ld1h "#ZLO".s, "#PRED"/z, [x28, "#ZINDEX".s, uxtw #1] \n\t" \
  " revh "#ZLO".s, "#PRED"/m, "#ZLO".s \n\t" \
  " fadd "#ZLO".h, "#ZLO".h, z31.h \n\t" \
  /* Second vector: same pattern, based at CPTR + CVECSKIP. */ \
  " add "#CSCRATCH", "#CPTR", "#CVECSKIP" \n\t" \
  " add x28, "#CSCRATCH", "#CROWOFF" \n\t" \
  " ld1h z31.s, "#PRED"/z, ["#CSCRATCH", "#ZINDEX".s, uxtw #1] \n\t" \
  " ld1h "#ZHI".s, "#PRED"/z, [x28, "#ZINDEX".s, uxtw #1] \n\t" \
  " revh "#ZHI".s, "#PRED"/m, "#ZHI".s \n\t" \
  " fadd "#ZHI".h, "#ZHI".h, z31.h \n\t" \
  /* Step to the next C column. */ \
  " add "#CPTR", "#CPTR", "#CCOLSKIP" \n\t"
|
||||
|
||||
/*
   Half-precision scatter-store of two vector registers into one C column,
   then advance the column pointer.

   For each of the two source registers (ZLO first, then ZHI):
     - one st1h scatter from the 32-bit containers to the base address;
     - the halfwords inside every 32-bit container are swapped (revh);
     - a second st1h scatter to base + CROWOFF.
   The base for ZLO is CPTR; the base for ZHI is CPTR + CVECSKIP (kept in
   CSCRATCH). Finally CPTR is advanced by CCOLSKIP.

   ZINDEX supplies unsigned 32-bit element indices, scaled by 2 bytes
   (uxtw #1). x28 is used as a hard-coded scratch register. ZLO and ZHI are
   left with their halfwords swapped when the macro finishes.
*/
#define GEMM_CCOL_SCATTER_STORE_FWD(ZLO,ZHI,ZINDEX,PRED,CROWOFF,CPTR,CCOLSKIP,CVECSKIP,CSCRATCH) \
  /* First vector: scatters based at CPTR and CPTR + CROWOFF. */ \
  " add x28, "#CPTR", "#CROWOFF" \n\t" \
  " st1h "#ZLO".s, "#PRED", ["#CPTR", "#ZINDEX".s, uxtw #1] \n\t" \
  " revh "#ZLO".s, "#PRED"/m, "#ZLO".s \n\t" \
  " st1h "#ZLO".s, "#PRED", [x28, "#ZINDEX".s, uxtw #1] \n\t" \
  /* Second vector: same pattern, based at CPTR + CVECSKIP. */ \
  " add "#CSCRATCH", "#CPTR", "#CVECSKIP" \n\t" \
  " add x28, "#CSCRATCH", "#CROWOFF" \n\t" \
  " st1h "#ZHI".s, "#PRED", ["#CSCRATCH", "#ZINDEX".s, uxtw #1] \n\t" \
  " revh "#ZHI".s, "#PRED"/m, "#ZHI".s \n\t" \
  " st1h "#ZHI".s, "#PRED", [x28, "#ZINDEX".s, uxtw #1] \n\t" \
  /* Step to the next C column. */ \
  " add "#CPTR", "#CPTR", "#CCOLSKIP" \n\t"
|
||||
|
||||
|
||||
void bli_shgemm_armsve_asm_2vx10_unindexed
|
||||
(
|
||||
dim_t k0,
|
||||
void* restrict alpha,
|
||||
void* restrict a,
|
||||
void* restrict b,
|
||||
void* restrict beta,
|
||||
void* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile (
|
||||
" ldr x0, %[a] \n\t"
|
||||
" ldr x1, %[b] \n\t"
|
||||
" mov x2, xzr \n\t"
|
||||
" inch x2, ALL, MUL #2 \n\t" // Column-skip of A.
|
||||
" mov x3, #10 \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
" ldr x5, %[c] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
#ifdef _A64FX
|
||||
" mov x8, 0x3 \n\t" // Tag C address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x5, x5, x8 \n\t"
|
||||
" mov x8, 0x2 \n\t" // Tag B address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x1, x1, x8 \n\t"
|
||||
" mov x8, 0x1 \n\t" // Tag A address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x0, x0, x8 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x8, #2 \n\t" // Multiply some address skips by sizeof(float16_t).
|
||||
" madd x2, x8, x2, xzr \n\t" // cs_a
|
||||
" madd x3, x8, x3, xzr \n\t" // rs_b
|
||||
" madd x7, x8, x7, xzr \n\t" // cs_c
|
||||
" ptrue p0.b \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[k_mker] \n\t" // Number of loops.
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
" LOAD_ABC: \n\t"
|
||||
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
|
||||
" b.eq END_CCOL_PRFM \n\t"
|
||||
|
||||
" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rh z21.h, p0/z, [x1, 2] \n\t"
|
||||
" ld1rh z22.h, p0/z, [x1, 4] \n\t"
|
||||
" ld1rh z23.h, p0/z, [x1, 6] \n\t"
|
||||
" ld1rh z24.h, p0/z, [x1, 8] \n\t"
|
||||
" ld1rh z25.h, p0/z, [x1, 10] \n\t"
|
||||
" ld1rh z26.h, p0/z, [x1, 12] \n\t"
|
||||
" ld1rh z27.h, p0/z, [x1, 14] \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
" \n\t"
|
||||
" CCOL_PRFM: \n\t"
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
|
||||
" mov x16, x5 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" END_CCOL_PRFM: \n\t"
|
||||
" \n\t"
|
||||
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
|
||||
" \n\t"
|
||||
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
|
||||
" b.eq K_LEFT_LOOP \n\t"
|
||||
" \n\t"
|
||||
" K_MKER_LOOP: \n\t"
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
|
||||
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" b K_MKER_LOOP \n\t"
|
||||
" \n\t"
|
||||
" FIN_MKER_LOOP: \n\t"
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
|
||||
" \n\t"
|
||||
" K_LEFT_LOOP: \n\t"
|
||||
" cmp x8, #0 \n\t" // End of execution.
|
||||
" b.eq WRITE_MEM_PREP \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rh z21.h, p0/z, [x1, 2] \n\t"
|
||||
" ld1rh z22.h, p0/z, [x1, 4] \n\t"
|
||||
" ld1rh z23.h, p0/z, [x1, 6] \n\t"
|
||||
" ld1rh z24.h, p0/z, [x1, 8] \n\t"
|
||||
" ld1rh z25.h, p0/z, [x1, 10] \n\t"
|
||||
" ld1rh z26.h, p0/z, [x1, 12] \n\t"
|
||||
" ld1rh z27.h, p0/z, [x1, 14] \n\t"
|
||||
" ld1rh z28.h, p0/z, [x1, 16] \n\t"
|
||||
" ld1rh z29.h, p0/z, [x1, 18] \n\t"
|
||||
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
|
||||
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
|
||||
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
|
||||
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
|
||||
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
|
||||
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
|
||||
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
|
||||
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
|
||||
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
|
||||
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
|
||||
" add x0, x0, x2 \n\t" // Forward A.
|
||||
" add x1, x1, x3 \n\t" // Forward B.
|
||||
" sub x8, x8, #1 \n\t"
|
||||
" b K_LEFT_LOOP \n\t" // Next column / row.
|
||||
" \n\t"
|
||||
" WRITE_MEM_PREP: \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ld1rh z30.h, p0/z, [x4] \n\t" // Load alpha & beta into vectors.
|
||||
" ld1rh z31.h, p0/z, [x8] \n\t"
|
||||
" fmov w4, h28 \n\t" // Copy alpha & beta to GP registers.
|
||||
" fmov w8, h29 \n\t"
|
||||
" \n\t"
|
||||
" PREFETCH_ABNEXT: \n\t"
|
||||
" ldr x0, %[a_next] \n\t"
|
||||
" ldr x1, %[b_next] \n\t"
|
||||
" prfm PLDL2KEEP, [x0] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*9] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*10] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*11] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*12] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*13] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*14] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*15] \n\t"
|
||||
" prfm PLDL2KEEP, [x1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM: \n\t"
|
||||
" \n\t"
|
||||
" fmov h28, #1.0 \n\t"
|
||||
" fmov w16, h28 \n\t"
|
||||
" cmp w16, w4 \n\t"
|
||||
" b.eq UNIT_ALPHA \n\t"
|
||||
" \n\t"
|
||||
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
|
||||
" \n\t"
|
||||
" UNIT_ALPHA: \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne WRITE_MEM_G \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
|
||||
" mov x10, xzr \n\t"
|
||||
" incb x10 \n\t"
|
||||
" madd x10, x10, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" mov x28, #2 \n\t"
|
||||
" madd x6, x28, x6, xzr \n\t" // Double index skip for half-precision case.
|
||||
" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x9,x7,x10,x16)
|
||||
" dup z31.h, w8 \n\t" // Restore beta destroyed by loading.
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x9,x7,x10,x16)
|
||||
" \n\t"
|
||||
" dup z31.h, w8 \n\t" // Restore beta destroyed by loading.
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x5,x7,x10,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x5,x7,x10,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" b END_EXEC \n\t"
|
||||
" \n\t"
|
||||
" END_ERROR: \n\t"
|
||||
" mov x0, #1 \n\t" // Return error.
|
||||
" END_EXEC: \n\t"
|
||||
" mov x0, #0 \n\t" // Return normal.
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next)
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
|
||||
"x9","x16","x10","x28",
|
||||
"z0","z1","z2","z3","z4","z5","z6","z7",
|
||||
"z8","z9","z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19",
|
||||
"z20","z21","z22","z23",
|
||||
"z24","z25","z26","z27",
|
||||
"z28","z29","z30","z31"
|
||||
);
|
||||
}
|
||||
|
||||
450
kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c
Normal file
450
kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c
Normal file
@@ -0,0 +1,450 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Separate instantiation for ArmSVE reference kernels.
|
||||
// Temporary workaround. Will be removed after upstream has switched to a better way
|
||||
// of exposing gemmsup interface.
|
||||
|
||||
//
|
||||
// -- Row storage case ---------------------------------------------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     ) \
{ \
	/* Reference gemmsup kernel, row-storage variant: c is visited row by */ \
	/* row, and every element is produced by a length-k dot product of a  */ \
	/* row of a with a column of b. There is no blocking, so m, n and k   */ \
	/* may be arbitrarily large. data and cntx are not used.              */ \
	\
	for ( dim_t row = 0; row < m; ++row ) \
	{ \
		ctype* restrict c_row = c + row*rs_c; \
		ctype* restrict a_row = a + row*rs_a; \
		\
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			ctype* restrict c_elem = c_row + col*cs_c; \
			ctype* restrict b_col  = b     + col*cs_b; \
			ctype           acc; \
			\
			PASTEMAC(ch,set0s)( acc ); \
			\
			/* Accumulate the dot product for element (row,col). The scalar */ \
			/* operation depends on which operands are to be conjugated.    */ \
			if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elem = a_row + p*cs_a; \
					ctype* restrict b_elem = b_col + p*rs_b; \
					\
					PASTEMAC(ch,dots)( *a_elem, *b_elem, acc ); \
				} \
			} \
			else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elem = a_row + p*cs_a; \
					ctype* restrict b_elem = b_col + p*rs_b; \
					\
					PASTEMAC(ch,axpyjs)( *a_elem, *b_elem, acc ); \
				} \
			} \
			else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elem = a_row + p*cs_a; \
					ctype* restrict b_elem = b_col + p*rs_b; \
					\
					PASTEMAC(ch,dotjs)( *a_elem, *b_elem, acc ); \
				} \
			} \
			else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elem = a_row + p*cs_a; \
					ctype* restrict b_elem = b_col + p*rs_b; \
					\
					PASTEMAC(ch,dots)( *a_elem, *b_elem, acc ); \
				} \
				\
				/* Conjugating the plain product simulates conj(a^T) * conj(b). */ \
				PASTEMAC(ch,conjs)( acc ); \
			} \
			\
			/* Fold the product into c: accumulate when beta is one,       */ \
			/* overwrite when beta is zero, otherwise scale c by beta and  */ \
			/* then accumulate.                                            */ \
			if ( PASTEMAC(ch,eq1)( *beta ) ) \
			{ \
				PASTEMAC(ch,axpys)( *alpha, acc, *c_elem ); \
			} \
			else if ( PASTEMAC(ch,eq0)( *beta ) ) \
			{ \
				PASTEMAC(ch,scal2s)( *alpha, acc, *c_elem ); \
			} \
			else \
			{ \
				PASTEMAC(ch,axpbys)( *alpha, acc, *beta, *c_elem ); \
			} \
		} \
	} \
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( gemmsup_r, _armsve, _ref2 )
|
||||
|
||||
//
|
||||
// -- Column storage case ------------------------------------------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
/* NOTE: This microkernel can actually handle arbitrarily large
|
||||
values of m, n, and k. */ \
|
||||
\
|
||||
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
|
||||
{ \
|
||||
/* Traverse c by columns. */ \
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cj = &c[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b[ j*cs_b ]; \
|
||||
\
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict cij = &cj[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a [ i*rs_a ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
|
||||
{ \
|
||||
/* Traverse c by columns. */ \
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cj = &c[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b[ j*cs_b ]; \
|
||||
\
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict cij = &cj[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a [ i*rs_a ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
|
||||
{ \
|
||||
/* Traverse c by columns. */ \
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cj = &c[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b[ j*cs_b ]; \
|
||||
\
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict cij = &cj[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a [ i*rs_a ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
|
||||
{ \
|
||||
/* Traverse c by columns. */ \
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cj = &c[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b[ j*cs_b ]; \
|
||||
\
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict cij = &cj[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a [ i*rs_a ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
|
||||
PASTEMAC(ch,conjs)( ab ); \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( gemmsup_c, _armsve, _ref2 )
|
||||
|
||||
@@ -0,0 +1,528 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
#include <assert.h>
|
||||
|
||||
// Double-precision composite instructions.
|
||||
#include "../armsve_asm_macros_double.h"
|
||||
|
||||
// 2vx10 microkernels.
|
||||
#include "../armsve_asm_2vx10.h"
|
||||
|
||||
// Prototype reference kernel.
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_c_armsve_ref2 )
|
||||
|
||||
void __attribute__ ((noinline,optimize(0))) bli_dgemmsup_cv_armsve_2vx10_unindexed
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
static int called = 0;
|
||||
if ( !called )
|
||||
{
|
||||
fprintf(stderr, "rv called.\n");
|
||||
called = 1;
|
||||
}
|
||||
// c*c requires A to be stored in columns.
|
||||
assert( rs_a0 == 1 );
|
||||
|
||||
dim_t n0_mker = n0 / 10;
|
||||
dim_t n0_left = n0 % 10;
|
||||
|
||||
if ( n0_left )
|
||||
{
|
||||
// A[:, ::]
|
||||
// B[::, n0_mker*10:n0]
|
||||
// C[: , n0_mker*10:n0]
|
||||
double *ai = a;
|
||||
double *bi = b + n0_mker * 10 * cs_b0;
|
||||
double *ci = c + n0_mker * 10 * cs_c0;
|
||||
bli_dgemmsup_c_armsve_ref2
|
||||
(
|
||||
conja, conjb,
|
||||
m0, n0_left, k0,
|
||||
alpha,
|
||||
ai, rs_a0, cs_a0,
|
||||
bi, rs_b0, cs_b0,
|
||||
beta,
|
||||
ci, rs_c0, cs_c0,
|
||||
data,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
// Return if it's a pure edge case.
|
||||
if ( !n0_mker )
|
||||
return;
|
||||
|
||||
// Determine VL.
|
||||
uint64_t vlen2;
|
||||
__asm__ (
|
||||
" mov x0, xzr \n\t"
|
||||
" incd x0, ALL, MUL #2 \n\t"
|
||||
" mov %[vlen2], x0 \n\t"
|
||||
: [vlen2] "=r" (vlen2)
|
||||
:
|
||||
: "x0"
|
||||
);
|
||||
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
// uint64_t rs_a = 1;
|
||||
uint64_t cs_a = cs_a0;
|
||||
uint64_t rs_b = rs_b0;
|
||||
uint64_t cs_b = cs_b0;
|
||||
|
||||
uint64_t k_mker = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
uint64_t n_mker = n0_mker;
|
||||
|
||||
dim_t m0_mker = m0 / vlen2;
|
||||
dim_t m0_left = m0 % vlen2;
|
||||
if ( m0_left )
|
||||
{
|
||||
// Edge case on A side can be handled with one more (predicated) loop.
|
||||
m0_mker++;
|
||||
} else
|
||||
m0_left = vlen2;
|
||||
// uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||
|
||||
for ( dim_t im0_mker = 0; im0_mker < m0_mker; ++im0_mker )
|
||||
{
|
||||
uint64_t m_curr = vlen2;
|
||||
if ( im0_mker == m0_mker - 1 )
|
||||
{
|
||||
// Last m-loop. Maybe unnecessary.
|
||||
m_curr = m0_left;
|
||||
}
|
||||
double *ai = a + im0_mker * vlen2 * rs_a0;
|
||||
double *bi = b;
|
||||
double *ci = c + im0_mker * vlen2 * rs_c0;
|
||||
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
__asm__ volatile (
|
||||
" ldr x0, %[bi] \n\t"
|
||||
" ldr x1, %[rs_b] \n\t" // Row-skip of B.
|
||||
" ldr x2, %[cs_b] \n\t" // Column-skip of B (element skip of B[l, :]).
|
||||
" ldr x3, %[ps_b] \n\t" // Panel-skip (10*k) of B.
|
||||
" ldr x4, %[cs_a] \n\t" // Column-Skip of A.
|
||||
" \n\t" // Element skip of A[:, l] is guaranteed to be 1.
|
||||
" ldr x5, %[ci] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x1 \n\t" // Tag C address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x5, x5, x16 \n\t"
|
||||
" mov x16, 0x2 \n\t" // Tag B address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x0, x0, x16 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
|
||||
" madd x1, x8, x1, xzr \n\t" // rs_b
|
||||
" madd x2, x8, x2, xzr \n\t" // cs_b
|
||||
" madd x3, x8, x3, xzr \n\t" // ps_b
|
||||
" madd x4, x8, x4, xzr \n\t" // cs_a
|
||||
" madd x7, x8, x7, xzr \n\t" // cs_c
|
||||
" mov x8, #4 \n\t"
|
||||
" madd x15, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for A.
|
||||
" \n\t"
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x20 \n\t" // Higher 6bit for Control#2:
|
||||
" lsl x16, x16, #58 \n\t" // Valid|Strong|Strong|NoAlloc|Load|Strong
|
||||
" orr x16, x16, x4 \n\t" // Stride.
|
||||
" msr S3_3_C11_C6_2, x16 \n\t" // Write system register.
|
||||
#endif
|
||||
" \n\t"
|
||||
" ldr x8, %[m_curr] \n\t" // Size of first dimension.
|
||||
" mov x9, xzr \n\t"
|
||||
" incd x9 \n\t"
|
||||
" ptrue p0.d \n\t"
|
||||
" whilelo p1.d, xzr, x8 \n\t"
|
||||
" whilelo p2.d, x9, x8 \n\t"
|
||||
" \n\t"
|
||||
" ldr x8, %[n_mker] \n\t" // Number of N-loops.
|
||||
" \n\t"
|
||||
" ldr x20, %[ai] \n\t" // Parameters to be reloaded
|
||||
" ldr x21, %[k_mker] \n\t" // within each millikernel loop.
|
||||
" ldr x22, %[k_left] \n\t"
|
||||
" ldr x23, %[alpha] \n\t"
|
||||
" ldr x24, %[beta] \n\t"
|
||||
" ldr x25, %[a_next] \n\t"
|
||||
" ldr x26, %[b_next] \n\t"
|
||||
" ldr x23, [x23] \n\t" // Directly load alpha and beta.
|
||||
" ldr x24, [x24] \n\t"
|
||||
" \n\t"
|
||||
" MILLIKER_MLOOP: \n\t"
|
||||
" \n\t"
|
||||
" mov x11, x0 \n\t" // B's address.
|
||||
// " ldr x10, %[ai] \n\t" // A's address.
|
||||
" mov x10, x20 \n\t"
|
||||
// " ldr x12, %[k_mker] \n\t"
|
||||
" mov x12, x21 \n\t"
|
||||
// " ldr x13, %[k_left] \n\t"
|
||||
" mov x13, x22 \n\t"
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x3 \n\t" // Tag A address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x10, x10, x16 \n\t"
|
||||
" mov x16, 0xa \n\t" // Control#2 for A address.
|
||||
" lsl x16, x16, #60 \n\t"
|
||||
" orr x10, x10, x16 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" cmp x12, #0 \n\t" // Don't preload if no microkernel there.
|
||||
" b.eq END_CCOL_PRFM \n\t"
|
||||
" \n\t"
|
||||
" mov x14, x11 \n\t"
|
||||
" ld1rd z20.d, p0/z, [x14] \n\t" // Load 8/10 of first B row.
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z21.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z22.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z23.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z24.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z25.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z26.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z27.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x14] \n\t" // And prefetch the 2/10 left.
|
||||
" add x14, x14, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x14] \n\t"
|
||||
" sub x14, x14, x2 \n\t" // Restore x14 to load edge.
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p1,p2,x10)
|
||||
" add x16, x10, x4 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t" // Prefetch 3/4 of A.
|
||||
" add x16, x10, x4 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x10, x4 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" \n\t"
|
||||
" CCOL_PRFM: \n\t"
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
|
||||
" mov x16, x5 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" END_CCOL_PRFM: \n\t"
|
||||
" \n\t"
|
||||
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
|
||||
" \n\t"
|
||||
" cmp x12, #0 \n\t" // If no 4-microkernel can be applied
|
||||
" b.eq K_LEFT_LOOP \n\t"
|
||||
" \n\t"
|
||||
" K_MKER_LOOP: \n\t"
|
||||
" \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
|
||||
" \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
|
||||
" \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
|
||||
" \n\t"
|
||||
" subs x12, x12, #1 \n\t" // Decrease counter before final replica.
|
||||
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
|
||||
" \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
|
||||
" b K_MKER_LOOP \n\t"
|
||||
" \n\t"
|
||||
" FIN_MKER_LOOP: \n\t"
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
|
||||
" add x10, x10, x4 \n\t" // Forward A to fill the blank.
|
||||
" \n\t"
|
||||
" K_LEFT_LOOP: \n\t"
|
||||
" cmp x13, #0 \n\t" // End of execution.
|
||||
" b.eq WRITE_MEM_PREP \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p1,p2,x10)
|
||||
" mov x14, x11 \n\t"
|
||||
" ld1rd z20.d, p0/z, [x14] \n\t" // Load 10/10 B.
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z21.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z22.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z23.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z24.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z25.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z26.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z27.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z28.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z29.d, p0/z, [x14] \n\t"
|
||||
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
|
||||
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
|
||||
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
|
||||
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
|
||||
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
|
||||
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
|
||||
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
|
||||
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
|
||||
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
|
||||
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
|
||||
" add x10, x10, x4 \n\t" // Forward A.
|
||||
" add x11, x11, x1 \n\t" // Forward B.
|
||||
" sub x13, x13, #1 \n\t"
|
||||
" b K_LEFT_LOOP \n\t" // Next column / row.
|
||||
" \n\t"
|
||||
" WRITE_MEM_PREP: \n\t"
|
||||
" \n\t"
|
||||
// " ldr x10, %[ai] \n\t"
|
||||
" mov x10, x20 \n\t"
|
||||
" add x11, x0, x3 \n\t"
|
||||
" dup z30.d, x23 \n\t" // Broadcast alpha & beta into vectors.
|
||||
" dup z31.d, x24 \n\t"
|
||||
" \n\t"
|
||||
" cmp x8, #1 \n\t"
|
||||
" b.eq PREFETCH_ABNEXT \n\t"
|
||||
" prfm PLDL1STRM, [x10] \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" b WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" PREFETCH_ABNEXT: \n\t"
|
||||
// " ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed.
|
||||
" mov x1, x25 \n\t"
|
||||
// " ldr x2, %[b_next] \n\t"
|
||||
" mov x2, x26 \n\t"
|
||||
" prfm PLDL2KEEP, [x1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*10] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*11] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*12] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*13] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*14] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*15] \n\t"
|
||||
" prfm PLDL2KEEP, [x2] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*9] \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM: \n\t"
|
||||
" \n\t"
|
||||
" fmov d28, #1.0 \n\t"
|
||||
" fmov x16, d28 \n\t"
|
||||
" cmp x16, x23 \n\t"
|
||||
" b.eq UNIT_ALPHA \n\t"
|
||||
" \n\t"
|
||||
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
|
||||
" \n\t"
|
||||
" UNIT_ALPHA: \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne WRITE_MEM_G \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
" mov x13, xzr \n\t" // C-column's physical 1-vector skip.
|
||||
" incb x13 \n\t"
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x5,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x5,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
|
||||
" mov x12, xzr \n\t"
|
||||
" incb x12 \n\t"
|
||||
" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x5,x7,x13,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x5,x7,x13,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" subs x8, x8, #1 \n\t"
|
||||
" b.eq END_EXEC \n\t"
|
||||
" \n\t" // Address of C already forwarded to next column.
|
||||
" add x0, x0, x3 \n\t" // Forward B's base address to the next logic panel.
|
||||
" b MILLIKER_MLOOP \n\t"
|
||||
" \n\t"
|
||||
" END_ERROR: \n\t"
|
||||
" mov x0, #1 \n\t" // Return error.
|
||||
" END_EXEC: \n\t"
|
||||
" mov x0, #0 \n\t" // Return normal.
|
||||
:
|
||||
: [bi] "m" (bi),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
[ps_b] "m" (ps_b),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ci] "m" (ci),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[m_curr] "m" (m_curr),
|
||||
[n_mker] "m" (n_mker),
|
||||
[ai] "m" (ai),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next)
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
|
||||
"x9","x10","x11","x12","x13","x14","x15","x16","x17",
|
||||
"x20","x21","x22","x23","x24","x25","x26",
|
||||
"z0","z1","z2","z3","z4","z5","z6","z7",
|
||||
"z8","z9","z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19",
|
||||
"z20","z21","z22","z23",
|
||||
"z24","z25","z26","z27",
|
||||
"z28","z29","z30","z31"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void bli_dgemmsup_rv_armsve_10x2v_unindexed
|
||||
(
|
||||
conj_t conjat,
|
||||
conj_t conjbt,
|
||||
dim_t m0t,
|
||||
dim_t n0t,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict at, inc_t rs_at0, inc_t cs_at0,
|
||||
double* restrict bt, inc_t rs_bt0, inc_t cs_bt0,
|
||||
double* restrict beta,
|
||||
double* restrict ct, inc_t rs_ct0, inc_t cs_ct0,
|
||||
auxinfo_t* restrict datat,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
auxinfo_t data;
|
||||
bli_auxinfo_set_next_a( bli_auxinfo_next_b( datat ), &data );
|
||||
bli_auxinfo_set_next_b( bli_auxinfo_next_a( datat ), &data );
|
||||
bli_auxinfo_set_ps_a( bli_auxinfo_ps_b( datat ), &data );
|
||||
bli_auxinfo_set_ps_b( bli_auxinfo_ps_a( datat ), &data );
|
||||
bli_dgemmsup_cv_armsve_2vx10_unindexed
|
||||
(
|
||||
conjbt, conjat,
|
||||
n0t, m0t, k0,
|
||||
alpha,
|
||||
bt, cs_bt0, rs_bt0,
|
||||
at, cs_at0, rs_at0,
|
||||
beta,
|
||||
ct, cs_ct0, rs_ct0,
|
||||
&data,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,412 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
#include <assert.h>
|
||||
|
||||
// Double-precision composite instructions.
|
||||
#include "../armsve_asm_macros_double.h"
|
||||
|
||||
// 2vx10 microkernels.
|
||||
#include "../armsve_asm_2vx10.h"
|
||||
|
||||
// Prototype reference kernel.
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_r_armsve_ref2 )
|
||||
|
||||
void __attribute__ ((optimize(0))) bli_dgemmsup_rv_armsve_2vx10_unindexed
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
static int called = 0;
|
||||
if ( !called )
|
||||
{
|
||||
fprintf(stderr, "rv called.\n");
|
||||
called = 1;
|
||||
}
|
||||
// r*r requires B to be stored in rows.
|
||||
assert(cs_b0 == 1);
|
||||
|
||||
dim_t n0_mker = n0 / 10;
|
||||
dim_t n0_left = n0 % 10;
|
||||
|
||||
if ( n0_left )
|
||||
{
|
||||
// A[:, ::]
|
||||
// B[::, n0_mker*10:n0]
|
||||
// C[: , n0_mker*10:n0]
|
||||
double *ai = a;
|
||||
double *bi = b + n0_mker * 10 * cs_b0;
|
||||
double *ci = c + n0_mker * 10 * cs_c0;
|
||||
bli_dgemmsup_r_armsve_ref2
|
||||
(
|
||||
conja, conjb,
|
||||
m0, n0_left, k0,
|
||||
alpha,
|
||||
ai, rs_a0, cs_a0,
|
||||
bi, rs_b0, cs_b0,
|
||||
beta,
|
||||
ci, rs_c0, cs_c0,
|
||||
data,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
// Return if it's a pure edge case.
|
||||
if ( !n0_mker )
|
||||
return;
|
||||
|
||||
// Determine VL.
|
||||
uint64_t vlen2;
|
||||
__asm__ (
|
||||
" mov x0, xzr \n\t"
|
||||
" incd x0, ALL, MUL #2 \n\t"
|
||||
" mov %[vlen2], x0 \n\t"
|
||||
: [vlen2] "=r" (vlen2)
|
||||
:
|
||||
: "x0"
|
||||
);
|
||||
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
uint64_t rs_a = rs_a0;
|
||||
uint64_t cs_a = cs_a0;
|
||||
uint64_t rs_b = rs_b0;
|
||||
// uint64_t cs_b = 1;
|
||||
|
||||
uint64_t k_mker = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
uint64_t m_mker = m0 / vlen2;
|
||||
uint64_t m_left = m0 % vlen2;
|
||||
if ( m_left )
|
||||
{
|
||||
// Edge case on A side can be handled with one more (predicated) loop.
|
||||
m_mker++;
|
||||
} else
|
||||
m_left = vlen2;
|
||||
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||
// uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||
|
||||
for ( dim_t in0_mker = 0; in0_mker < n0_mker; ++in0_mker )
|
||||
{
|
||||
double *ai = a;
|
||||
double *bi = b + in0_mker * 10 * cs_b0;
|
||||
double *ci = c + in0_mker * 10 * cs_c0;
|
||||
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
__asm__ volatile (
|
||||
" ldr x0, %[ai] \n\t"
|
||||
" ldr x1, %[rs_a] \n\t" // Row-skip of A (element skip of A[:, l]).
|
||||
" ldr x2, %[cs_a] \n\t" // Column-skip of A.
|
||||
" ldr x3, %[ps_a] \n\t" // Panel-skip (vlen2*k) of A.
|
||||
" ldr x4, %[rs_b] \n\t" // Row-Skip of B.
|
||||
" \n\t" // Element skip of B[l, :] is guaranteed to be 1.
|
||||
" ldr x5, %[ci] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x1 \n\t" // Tag C address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x5, x5, x16 \n\t"
|
||||
" mov x16, 0x2 \n\t" // Tag A address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x0, x0, x16 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
|
||||
" madd x2, x8, x2, xzr \n\t" // cs_a
|
||||
" madd x3, x8, x3, xzr \n\t" // ps_a
|
||||
" madd x4, x8, x4, xzr \n\t" // rs_b
|
||||
" madd x7, x8, x7, xzr \n\t" // cs_c
|
||||
" mov x8, xzr \n\t"
|
||||
" incb x8 \n\t"
|
||||
" madd x14, x8, x1, xzr \n\t" // A-column's logical 1-vector skip.
|
||||
" mov x8, #4 \n\t"
|
||||
" madd x15, x8, x2, xzr \n\t" // Logical K=4 microkernel skip for A.
|
||||
// " mov x8, #4 \n\t"
|
||||
// " madd x17, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for B.
|
||||
" \n\t"
|
||||
" ldr x8, %[m_mker] \n\t" // Number of M-loops.
|
||||
" ptrue p0.d \n\t"
|
||||
" ptrue p1.d \n\t"
|
||||
" ptrue p2.d \n\t"
|
||||
" \n\t"
|
||||
" MILLIKER_MLOOP: \n\t"
|
||||
" \n\t"
|
||||
" cmp x8, #1 \n\t"
|
||||
" b.ne UKER_BEGIN \n\t"
|
||||
" \n\t"
|
||||
" ldr x10, %[m_left] \n\t" // Final (incomplete) millikernel loop.
|
||||
" mov x11, xzr \n\t"
|
||||
" incd x11 \n\t"
|
||||
" whilelo p1.d, xzr, x10 \n\t" // Overwrite p1/p2.
|
||||
" whilelo p2.d, x11, x10 \n\t"
|
||||
" \n\t"
|
||||
" UKER_BEGIN: \n\t"
|
||||
" mov x10, x0 \n\t" // A's address.
|
||||
" ldr x11, %[bi] \n\t" // B's address.
|
||||
" ldr x12, %[k_mker] \n\t"
|
||||
" ldr x13, %[k_left] \n\t"
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x3 \n\t" // Tag B address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x11, x11, x16 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x16, x11 \n\t" // Prefetch first kernel of B.
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x4 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x4 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x4 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" \n\t"
|
||||
" ld1rd z20.d, p0/z, [x11] \n\t" // (Partial) first B row.
|
||||
" ld1rd z21.d, p0/z, [x11, #8] \n\t"
|
||||
" ld1rd z22.d, p0/z, [x11, #16] \n\t"
|
||||
" ld1rd z23.d, p0/z, [x11, #24] \n\t"
|
||||
" ld1rd z24.d, p0/z, [x11, #32] \n\t"
|
||||
" ld1rd z25.d, p0/z, [x11, #40] \n\t"
|
||||
" ld1rd z26.d, p0/z, [x11, #48] \n\t"
|
||||
" ld1rd z27.d, p0/z, [x11, #56] \n\t"
|
||||
" \n\t"
|
||||
" index z29.d, xzr, x1 \n\t" // First A column.
|
||||
" \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_ACOL_GATHER_LOAD(z28,z29,z29,p1,p2,x10,x14,x16)
|
||||
" \n\t"
|
||||
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
|
||||
" \n\t"
|
||||
" cmp x12, #0 \n\t" // If no 4-microkernel can be applied
|
||||
" b.eq K_LEFT_LOOP \n\t"
|
||||
" \n\t"
|
||||
" K_MKER_LOOP: \n\t" // Unroll the 4-loop.
|
||||
" \n\t"
|
||||
" index z31.d, xzr, x1 \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
|
||||
" \n\t"
|
||||
" index z29.d, xzr, x1 \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
|
||||
" \n\t"
|
||||
" index z31.d, xzr, x1 \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
|
||||
" \n\t"
|
||||
" subs x12, x12, #1 \n\t" // Decrease counter before final replica.
|
||||
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
|
||||
" \n\t"
|
||||
" index z29.d, xzr, x1 \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
|
||||
" b K_MKER_LOOP \n\t"
|
||||
" \n\t"
|
||||
" FIN_MKER_LOOP: \n\t"
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
|
||||
" add x10, x10, x2 \n\t" // Forward A to fill the blank.
|
||||
" \n\t"
|
||||
" K_LEFT_LOOP: \n\t"
|
||||
" cmp x13, #0 \n\t"
|
||||
" b.eq WRITE_MEM_PREP \n\t"
|
||||
" \n\t"
|
||||
" index z31.d, xzr, x1 \n\t"
|
||||
GEMM_ACOL_GATHER_LOAD(z30,z31,z31,p1,p2,x10,x14,x16)
|
||||
" ld1rd z20.d, p0/z, [x11] \n\t"
|
||||
" ld1rd z21.d, p0/z, [x11, #8] \n\t"
|
||||
" ld1rd z22.d, p0/z, [x11, #16] \n\t"
|
||||
" ld1rd z23.d, p0/z, [x11, #24] \n\t"
|
||||
" ld1rd z24.d, p0/z, [x11, #32] \n\t"
|
||||
" ld1rd z25.d, p0/z, [x11, #40] \n\t"
|
||||
" ld1rd z26.d, p0/z, [x11, #48] \n\t"
|
||||
" ld1rd z27.d, p0/z, [x11, #56] \n\t"
|
||||
" ld1rd z28.d, p0/z, [x11, #64] \n\t"
|
||||
" ld1rd z29.d, p0/z, [x11, #72] \n\t"
|
||||
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
|
||||
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
|
||||
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
|
||||
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
|
||||
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
|
||||
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
|
||||
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
|
||||
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
|
||||
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
|
||||
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
|
||||
" add x10, x10, x2 \n\t" // Forward A.
|
||||
" add x11, x11, x4 \n\t" // Forward B.
|
||||
" sub x13, x13, #1 \n\t"
|
||||
" b K_LEFT_LOOP \n\t" // Next column / row.
|
||||
" \n\t"
|
||||
" WRITE_MEM_PREP: \n\t"
|
||||
" \n\t"
|
||||
" ldr x11, %[bi] \n\t"
|
||||
" ldr x12, %[alpha] \n\t" // Load alpha & beta.
|
||||
" ldr x13, %[beta] \n\t"
|
||||
" ld1rd z30.d, p0/z, [x12] \n\t"
|
||||
" ld1rd z31.d, p0/z, [x13] \n\t"
|
||||
" ldr x12, [x12] \n\t"
|
||||
" \n\t"
|
||||
" cmp x8, #1 \n\t"
|
||||
" b.eq PREFETCH_ABNEXT \n\t"
|
||||
" prfm PLDL2STRM, [x11] \n\t"
|
||||
" b WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" PREFETCH_ABNEXT: \n\t"
|
||||
" ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed.
|
||||
" ldr x2, %[b_next] \n\t"
|
||||
" prfm PLDL2KEEP, [x1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*10] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*11] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*12] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*13] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*14] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*15] \n\t"
|
||||
" prfm PLDL2KEEP, [x2] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*9] \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM: \n\t"
|
||||
" \n\t"
|
||||
" fmov d28, #1.0 \n\t"
|
||||
" fmov x16, d28 \n\t"
|
||||
" cmp x16, x12 \n\t"
|
||||
" b.eq UNIT_ALPHA \n\t"
|
||||
" \n\t"
|
||||
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
|
||||
" \n\t"
|
||||
" UNIT_ALPHA: \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" mov x10, x5 \n\t" // C address for storing.
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne WRITE_MEM_G \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
" mov x13, xzr \n\t" // C-column's physical 1-vector skip.
|
||||
" incb x13 \n\t"
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x10,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x10,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
|
||||
" mov x12, xzr \n\t"
|
||||
" incb x12 \n\t"
|
||||
" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x10,x7,x13,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x10,x7,x13,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" subs x8, x8, #1 \n\t"
|
||||
" b.eq END_EXEC \n\t"
|
||||
" \n\t"
|
||||
" add x0, x0, x3 \n\t" // Forward A's base address to the next logic panel.
|
||||
" add x5, x5, x13 \n\t" // Forward C's base address to the next logic panel.
|
||||
" add x5, x5, x13 \n\t"
|
||||
" b MILLIKER_MLOOP \n\t"
|
||||
" \n\t"
|
||||
" END_ERROR: \n\t"
|
||||
" mov x0, #1 \n\t" // Return error.
|
||||
" END_EXEC: \n\t"
|
||||
" mov x0, #0 \n\t" // Return normal.
|
||||
:
|
||||
: [ai] "m" (ai),
|
||||
[rs_a] "m" (rs_a),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ps_a] "m" (ps_a),
|
||||
[rs_b] "m" (rs_b),
|
||||
[ci] "m" (ci),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[m_mker] "m" (m_mker),
|
||||
[m_left] "m" (m_left),
|
||||
[bi] "m" (bi),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next)
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
|
||||
"x9","x10","x11","x12","x13","x14","x15","x16",//"x17",
|
||||
"z0","z1","z2","z3","z4","z5","z6","z7",
|
||||
"z8","z9","z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19",
|
||||
"z20","z21","z22","z23",
|
||||
"z24","z25","z26","z27",
|
||||
"z28","z29","z30","z31"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,5 +33,13 @@
|
||||
*/
|
||||
|
||||
GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 )
|
||||
GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed )
|
||||
GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed )
|
||||
|
||||
PACKM_KER_PROT( double, d, packm_armsve256_asm_8xk )
|
||||
PACKM_KER_PROT( double, d, packm_armsve512_asm_16xk )
|
||||
PACKM_KER_PROT( double, d, packm_armsve512_asm_12xk )
|
||||
PACKM_KER_PROT( double, d, packm_armsve512_asm_10xk )
|
||||
|
||||
@@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_3xk
|
||||
|
||||
mov(var(kappa), rcx) // load address of kappa
|
||||
vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate
|
||||
vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate
|
||||
vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate
|
||||
|
||||
|
||||
// now branch on kappa == 1.0
|
||||
|
||||
@@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_8xk
|
||||
|
||||
mov(var(kappa), rcx) // load address of kappa
|
||||
vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate
|
||||
vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate
|
||||
vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate
|
||||
|
||||
|
||||
// now branch on kappa == 1.0
|
||||
|
||||
88
sandbox/gemmlike/bli_gemmnat.c
Normal file
88
sandbox/gemmlike/bli_gemmnat.c
Normal file
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Given the current architecture of BLIS sandboxes, bli_gemmnat() is the
|
||||
// entry point to any sandbox implementation.
|
||||
|
||||
// NOTE: This function is implemented identically to the function that it
|
||||
// overrides in frame/ind/oapi/bli_l3_nat_oapi.c. This means that we are
|
||||
// forgoing the option of customizing the implementations that underlie
|
||||
// bli_gemm() and bli_?gemm(). Any new code defined in this sandbox
|
||||
// directory, however, will be included in the BLIS library.
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, cname, imeth ) \
|
||||
\
|
||||
void PASTEMAC(opname,imeth) \
|
||||
( \
|
||||
obj_t* alpha, \
|
||||
obj_t* a, \
|
||||
obj_t* b, \
|
||||
obj_t* beta, \
|
||||
obj_t* c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm \
|
||||
) \
|
||||
{ \
|
||||
\
|
||||
/* A switch to easily toggle whether we use the sandbox implementation
|
||||
of bls_gemm() as the implementation for bli_gemm(). (This allows for
|
||||
easy testing of bls_gemm() via the testsuite.) */ \
|
||||
if ( 1 ) \
|
||||
{ \
|
||||
bls_gemm_ex( alpha, a, b, beta, c, cntx, rntm ); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
bli_init_once(); \
|
||||
\
|
||||
/* Obtain a valid (native) context from the gks if necessary. */ \
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||
\
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
/* Invoke the operation's front end. */ \
|
||||
PASTEMAC(opname,_front) \
|
||||
( \
|
||||
alpha, a, b, beta, c, cntx, rntm, NULL \
|
||||
); \
|
||||
}
|
||||
|
||||
GENFRONT( gemm, gemm, nat )
|
||||
56
sandbox/gemmlike/bli_sandbox.h
Normal file
56
sandbox/gemmlike/bli_sandbox.h
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SANDBOX_H
|
||||
#define BLIS_SANDBOX_H
|
||||
|
||||
// NOTE: This header is the only header required to be present in the sandbox
|
||||
// implementation directory.
|
||||
|
||||
// This header should contain (or #include) any definitions that must be
|
||||
// folded into blis.h. Typically, it will remain empty since any header
|
||||
// definitions specific to the sandbox implementation will not need to be
|
||||
// made available to applications (or the framework) during compilation.
|
||||
|
||||
#include "bls_gemm.h"
|
||||
#include "bls_gemm_var.h"
|
||||
|
||||
#include "bls_l3_packm_a.h"
|
||||
#include "bls_l3_packm_b.h"
|
||||
#include "bls_l3_packm_var.h"
|
||||
|
||||
#include "bls_l3_decor.h"
|
||||
|
||||
|
||||
#endif
|
||||
304
sandbox/gemmlike/bls_gemm.c
Normal file
304
sandbox/gemmlike/bls_gemm.c
Normal file
@@ -0,0 +1,304 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
//
|
||||
// -- Define the gemm-like operation's object API ------------------------------
|
||||
//
|
||||
|
||||
void bls_gemm
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c
|
||||
)
|
||||
{
|
||||
bls_gemm_ex
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
void bls_gemm_ex
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
bli_init_once();
|
||||
|
||||
// -- bli_gemmnat() --------------------------------------------------------
|
||||
|
||||
// Obtain a valid (native) context from the gks if necessary.
|
||||
// NOTE: This must be done before calling the _check() function, since
|
||||
// that function assumes the context pointer is valid.
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_l;
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; }
|
||||
|
||||
// -- bli_gemm_front() -----------------------------------------------------
|
||||
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
{
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
}
|
||||
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( c ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// If alpha is zero, or if A or B has a zero dimension, scale C by beta
|
||||
// and return early.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
|
||||
bli_obj_has_zero_dim( a ) ||
|
||||
bli_obj_has_zero_dim( b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
// Induce a transposition of A if it has its transposition property set.
|
||||
// Then clear the transposition bit in the object.
|
||||
if ( bli_obj_has_trans( &a_local ) )
|
||||
{
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
|
||||
}
|
||||
|
||||
// Induce a transposition of B if it has its transposition property set.
|
||||
// Then clear the transposition bit in the object.
|
||||
if ( bli_obj_has_trans( &b_local ) )
|
||||
{
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &b_local );
|
||||
}
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_induce_trans( &c_local );
|
||||
|
||||
// NOTE: This is probably not needed within the sandbox.
|
||||
// We must also swap the pack schemas, which were set by bli_gemm_md()
|
||||
// or the inlined code above.
|
||||
//bli_obj_swap_pack_schemas( &a_local, &b_local );
|
||||
}
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
bli_rntm_set_ways_for_op
|
||||
(
|
||||
BLIS_GEMM,
|
||||
BLIS_LEFT, // ignored for gemm/hemm/symm
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
rntm
|
||||
);
|
||||
|
||||
// Spawn threads (if applicable), where bls_gemm_int() is the thread entry
|
||||
// point function for each thread. This also begins the process of creating
|
||||
// the thrinfo_t tree, which contains thread communicators.
|
||||
bls_l3_thread_decorator
|
||||
(
|
||||
bls_gemm_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntx,
|
||||
rntm
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// -- Define the gemm-like operation's thread entry point ----------------------
|
||||
//
|
||||
|
||||
void bls_gemm_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
// In this function, we choose the gemm implementation that is executed
|
||||
// on each thread.
|
||||
|
||||
#if 1
|
||||
// Call the block-panel algorithm that calls the kernel directly, which
|
||||
// exposes edge-case handling.
|
||||
bls_gemm_bp_var1
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread
|
||||
);
|
||||
#else
|
||||
// Call the block-panel algorithm that calls the kernel indirectly via a
|
||||
// wrapper function, which hides edge-case handling.
|
||||
bls_gemm_bp_var2
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// -- Define the gemm-like operation's typed API -------------------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,opname) \
|
||||
( \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
bli_init_once(); \
|
||||
\
|
||||
/* Determine the datatype (e.g. BLIS_FLOAT, BLIS_DOUBLE, etc.) based on
|
||||
the macro parameter 'ch' (e.g. s, d, etc). */ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
/* Adjust the dimensions of matrices A and B according to the transa and
|
||||
transb parameters. */ \
|
||||
bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
|
||||
bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \
|
||||
\
|
||||
/* Create bufferless scalar objects and attach the provided scalar pointers
|
||||
to those scalar objects. */ \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
/* Create bufferless matrix objects and attach the provided matrix pointers
|
||||
to those matrix objects. */ \
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
/* Set the transposition/conjugation properties of the objects for matrices
|
||||
A and B. */ \
|
||||
bli_obj_set_conjtrans( transa, &ao ); \
|
||||
bli_obj_set_conjtrans( transb, &bo ); \
|
||||
\
|
||||
/* Call the object interface. */ \
|
||||
PASTECH(bls_,opname) \
|
||||
( \
|
||||
&alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co \
|
||||
); \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( gemm )
|
||||
GENTFUNC( float, s, gemm )
|
||||
GENTFUNC( double, d, gemm )
|
||||
GENTFUNC( scomplex, c, gemm )
|
||||
GENTFUNC( dcomplex, z, gemm )
|
||||
|
||||
101
sandbox/gemmlike/bls_gemm.h
Normal file
101
sandbox/gemmlike/bls_gemm.h
Normal file
@@ -0,0 +1,101 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//
|
||||
// -- Prototype the gemm-like operation's object API ---------------------------
|
||||
//
|
||||
|
||||
void bls_gemm
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c
|
||||
);
|
||||
|
||||
void bls_gemm_ex
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
//
|
||||
// -- Prototype the gemm-like operation's thread entry point -------------------
|
||||
//
|
||||
|
||||
void bls_gemm_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
//
|
||||
// -- Prototype the gemm-like operation's typed API ----------------------------
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,opname) \
|
||||
( \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( gemm )
|
||||
GENTPROT( float, s, gemm )
|
||||
GENTPROT( double, d, gemm )
|
||||
GENTPROT( scomplex, c, gemm )
|
||||
GENTPROT( dcomplex, z, gemm )
|
||||
|
||||
521
sandbox/gemmlike/bls_gemm_bp_var1.c
Normal file
521
sandbox/gemmlike/bls_gemm_bp_var1.c
Normal file
@@ -0,0 +1,521 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* restrict alpha,
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a,
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b,
|
||||
void* restrict beta,
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* restrict cntx,
|
||||
rntm_t* restrict rntm,
|
||||
thrinfo_t* restrict thread
|
||||
);
|
||||
|
||||
//
|
||||
// -- gemm-like block-panel algorithm (object interface) -----------------------
|
||||
//
|
||||
|
||||
// Define a function pointer array named ftypes and initialize its contents with
|
||||
// the addresses of the typed functions defined below, bls_?gemm_bp_var1().
|
||||
static FUNCPTR_T GENARRAY_PREF(ftypes,bls_,gemm_bp_var1);
|
||||
|
||||
void bls_gemm_bp_var1
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t k = bli_obj_width( a );
|
||||
|
||||
void* restrict buf_a = bli_obj_buffer_at_off( a );
|
||||
const inc_t rs_a = bli_obj_row_stride( a );
|
||||
const inc_t cs_a = bli_obj_col_stride( a );
|
||||
|
||||
void* restrict buf_b = bli_obj_buffer_at_off( b );
|
||||
const inc_t rs_b = bli_obj_row_stride( b );
|
||||
const inc_t cs_b = bli_obj_col_stride( b );
|
||||
|
||||
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||
const inc_t rs_c = bli_obj_row_stride( c );
|
||||
const inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
|
||||
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta );
|
||||
|
||||
// Index into the function pointer array to extract the correct
|
||||
// typed function pointer based on the chosen datatype.
|
||||
FUNCPTR_T f = ftypes[dt];
|
||||
|
||||
// Invoke the function.
|
||||
f
|
||||
(
|
||||
conja,
|
||||
conjb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// -- gemm-like block-panel algorithm (typed interface) ------------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* restrict alpha, \
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
void* restrict beta, \
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Query the context for various blocksizes. */ \
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
|
||||
const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
|
||||
const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
|
||||
\
|
||||
/* Query the context for the microkernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
/* Compute partitioning step values for each matrix of each loop. */ \
|
||||
const inc_t jcstep_c = cs_c; \
|
||||
const inc_t jcstep_b = cs_b; \
|
||||
\
|
||||
const inc_t pcstep_a = cs_a; \
|
||||
const inc_t pcstep_b = rs_b; \
|
||||
\
|
||||
const inc_t icstep_c = rs_c; \
|
||||
const inc_t icstep_a = rs_a; \
|
||||
\
|
||||
const inc_t jrstep_c = cs_c * NR; \
|
||||
\
|
||||
const inc_t irstep_c = rs_c * MR; \
|
||||
\
|
||||
ctype* restrict a_00 = a; \
|
||||
ctype* restrict b_00 = b; \
|
||||
ctype* restrict c_00 = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
\
|
||||
/* Make local copies of the scalars to prevent any unnecessary sharing of
|
||||
cache lines between the cores' caches. */ \
|
||||
ctype alpha_local = *alpha_cast; \
|
||||
ctype beta_local = *beta_cast; \
|
||||
ctype one_local = *PASTEMAC(ch,1); \
|
||||
ctype zero_local = *PASTEMAC(ch,0); \
|
||||
\
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
|
||||
needed for the matrix we will be packing (if any), but we do it
|
||||
unconditionally to be safe. */ \
|
||||
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||
\
|
||||
/* Define an array of bszid_t ids, which will act as our substitute for
|
||||
the cntl_t tree. */ \
|
||||
bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \
|
||||
BLIS_KC, /* 4th loop */ \
|
||||
BLIS_NO_PART, /* pack B */ \
|
||||
BLIS_MC, /* 3rd loop */ \
|
||||
BLIS_NO_PART, /* pack A */ \
|
||||
BLIS_NR, /* 2nd loop */ \
|
||||
BLIS_MR, /* 1st loop */ \
|
||||
BLIS_KR }; /* microkernel loop */ \
|
||||
\
|
||||
bszid_t* restrict bszids_jc = &bszids[0]; \
|
||||
bszid_t* restrict bszids_pc = &bszids[1]; \
|
||||
/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
|
||||
bszid_t* restrict bszids_ic = &bszids[3]; \
|
||||
/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
|
||||
bszid_t* restrict bszids_jr = &bszids[5]; \
|
||||
/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
thrinfo_t* restrict thread_pb = NULL; \
|
||||
thrinfo_t* restrict thread_ic = NULL; \
|
||||
thrinfo_t* restrict thread_pa = NULL; \
|
||||
thrinfo_t* restrict thread_jr = NULL; \
|
||||
thrinfo_t* restrict thread_ir = NULL; \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_jc = thread; \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
|
||||
\
|
||||
/* Compute the JC loop thread range for the current thread. */ \
|
||||
dim_t jc_start, jc_end; \
|
||||
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
|
||||
const dim_t n_local = jc_end - jc_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = n_local % NC; \
|
||||
\
|
||||
/* Loop over the n dimension (NC rows/columns at a time). */ \
|
||||
for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current JC block dimension. */ \
|
||||
const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
|
||||
\
|
||||
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
|
||||
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_pc = bli_thrinfo_sub_node( thread_jc ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
|
||||
\
|
||||
/* Compute the PC loop thread range for the current thread. */ \
|
||||
const dim_t pc_start = 0, pc_end = k; \
|
||||
const dim_t k_local = k; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the PC loop. */ \
|
||||
/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
|
||||
const dim_t pc_left = k_local % KC; \
|
||||
\
|
||||
/* Loop over the k dimension (KC rows/columns at a time). */ \
|
||||
for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current PC block dimension. */ \
|
||||
const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
|
||||
\
|
||||
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
|
||||
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
|
||||
\
|
||||
/* Only apply beta to the first iteration of the pc loop. */ \
|
||||
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
|
||||
\
|
||||
ctype* b_use; \
|
||||
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. Note that the thrinfo_t
|
||||
node will have already been created by a previous call to
|
||||
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
|
||||
cause the tree to grow by two (e.g. to the next bszid that is
|
||||
a normal bszid_t value). */ \
|
||||
thread_pb = bli_thrinfo_sub_node( thread_pc ); \
|
||||
/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
B. Then call the packm implementation. */ \
|
||||
PASTECH2(bls_,ch,packm_b) \
|
||||
( \
|
||||
conjb, \
|
||||
KC, NC, \
|
||||
kc_cur, nc_cur, NR, \
|
||||
&one_local, \
|
||||
b_pc, rs_b, cs_b, \
|
||||
&b_use, &rs_b_use, &cs_b_use, \
|
||||
&ps_b_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/* Alias b_use so that it's clear this is our current block of
|
||||
matrix B. */ \
|
||||
ctype* restrict b_pc_use = b_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
|
||||
\
|
||||
/* Compute the IC loop thread range for the current thread. */ \
|
||||
dim_t ic_start, ic_end; \
|
||||
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
|
||||
const dim_t m_local = ic_end - ic_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = m_local % MC; \
|
||||
\
|
||||
/* Loop over the m dimension (MC rows at a time). */ \
|
||||
for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current IC block dimension. */ \
|
||||
const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
|
||||
\
|
||||
ctype* restrict a_ic = a_pc + ii * icstep_a; \
|
||||
ctype* restrict c_ic = c_jc + ii * icstep_c; \
|
||||
\
|
||||
ctype* a_use; \
|
||||
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. Note that the thrinfo_t
|
||||
node will have already been created by a previous call to
|
||||
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
|
||||
cause the tree to grow by two (e.g. to the next bszid that is
|
||||
a normal bszid_t value). */ \
|
||||
thread_pa = bli_thrinfo_sub_node( thread_ic ); \
|
||||
/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
A. Then call the packm implementation. */ \
|
||||
PASTECH2(bls_,ch,packm_a) \
|
||||
( \
|
||||
conja, \
|
||||
MC, KC, \
|
||||
mc_cur, kc_cur, MR, \
|
||||
&one_local, \
|
||||
a_ic, rs_a, cs_a, \
|
||||
&a_use, &rs_a_use, &cs_a_use, \
|
||||
&ps_a_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread_pa \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
matrix A. */ \
|
||||
ctype* restrict a_ic_use = a_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for the JR loop.
|
||||
NOTE: These values are only needed when computing the next
|
||||
micropanel of B. */ \
|
||||
const dim_t jr_nt = bli_thread_n_way( thread_jr ); \
|
||||
const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JR loop. */ \
|
||||
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
|
||||
dim_t jr_left = nc_cur % NR; \
|
||||
\
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
|
||||
{ \
|
||||
const dim_t nr_cur \
|
||||
= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
|
||||
\
|
||||
ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
|
||||
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
||||
\
|
||||
/* Assume for now that our next panel of B to be the current panel
|
||||
of B. */ \
|
||||
ctype* restrict b2 = b_jr; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. */ \
|
||||
thread_ir = bli_thrinfo_sub_node( thread_jr ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for the IR loop.
|
||||
NOTE: These values are only needed when computing the next
|
||||
micropanel of A. */ \
|
||||
const dim_t ir_nt = bli_thread_n_way( thread_ir ); \
|
||||
const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IR loop. */ \
|
||||
dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
|
||||
dim_t ir_left = mc_cur % MR; \
|
||||
\
|
||||
/* Compute the IR loop thread range for the current thread. */ \
|
||||
dim_t ir_start, ir_end; \
|
||||
bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
|
||||
{ \
|
||||
const dim_t mr_cur \
|
||||
= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
|
||||
\
|
||||
ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
|
||||
ctype* restrict c_ir = c_jr + i * irstep_c; \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next micropanels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
|
||||
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_ic_use; \
|
||||
b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
|
||||
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
|
||||
b2 = b_pc_use; \
|
||||
} \
|
||||
\
|
||||
/* Save the addresses of next micropanels of A and B to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( mr_cur == MR && nr_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm microkernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
kc_cur, \
|
||||
&alpha_local, \
|
||||
a_ir, \
|
||||
b_jr, \
|
||||
beta_use, \
|
||||
c_ir, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm microkernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
kc_cur, \
|
||||
&alpha_local, \
|
||||
a_ir, \
|
||||
b_jr, \
|
||||
&zero_local, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result from above. */ \
|
||||
PASTEMAC(ch,xpbys_mxn) \
|
||||
( \
|
||||
mr_cur, \
|
||||
nr_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_use, \
|
||||
c_ir, rs_c, cs_c \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* This barrier is needed to prevent threads from starting to pack
|
||||
the next row panel of B before the current row panel is fully
|
||||
computed upon. */ \
|
||||
bli_thread_barrier( thread_pb ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Release any memory that was acquired for packing matrices A and B. */ \
|
||||
PASTECH2(bls_,ch,packm_finalize_mem_a) \
|
||||
( \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread_pa \
|
||||
); \
|
||||
PASTECH2(bls_,ch,packm_finalize_mem_b) \
|
||||
( \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( gemm_bp_var1 )
|
||||
GENTFUNC( float, s, gemm_bp_var1 )
|
||||
GENTFUNC( double, d, gemm_bp_var1 )
|
||||
GENTFUNC( scomplex, c, gemm_bp_var1 )
|
||||
GENTFUNC( dcomplex, z, gemm_bp_var1 )
|
||||
|
||||
596
sandbox/gemmlike/bls_gemm_bp_var2.c
Normal file
596
sandbox/gemmlike/bls_gemm_bp_var2.c
Normal file
@@ -0,0 +1,596 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* restrict alpha,
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a,
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b,
|
||||
void* restrict beta,
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* restrict cntx,
|
||||
rntm_t* restrict rntm,
|
||||
thrinfo_t* restrict thread
|
||||
);
|
||||
|
||||
//
|
||||
// -- gemm-like block-panel algorithm (object interface) -----------------------
|
||||
//
|
||||
|
||||
// Define a function pointer array named ftypes and initialize its contents with
|
||||
// the addresses of the typed functions defined below, bls_?gemm_bp_var2().
|
||||
static FUNCPTR_T GENARRAY_PREF(ftypes,bls_,gemm_bp_var2);
|
||||
|
||||
void bls_gemm_bp_var2
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t k = bli_obj_width( a );
|
||||
|
||||
void* restrict buf_a = bli_obj_buffer_at_off( a );
|
||||
const inc_t rs_a = bli_obj_row_stride( a );
|
||||
const inc_t cs_a = bli_obj_col_stride( a );
|
||||
|
||||
void* restrict buf_b = bli_obj_buffer_at_off( b );
|
||||
const inc_t rs_b = bli_obj_row_stride( b );
|
||||
const inc_t cs_b = bli_obj_col_stride( b );
|
||||
|
||||
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||
const inc_t rs_c = bli_obj_row_stride( c );
|
||||
const inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
|
||||
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta );
|
||||
|
||||
// Index into the function pointer array to extract the correct
|
||||
// typed function pointer based on the chosen datatype.
|
||||
FUNCPTR_T f = ftypes[dt];
|
||||
|
||||
// Invoke the function.
|
||||
f
|
||||
(
|
||||
conja,
|
||||
conjb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// -- gemm-like block-panel algorithm (typed interface) ------------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* restrict alpha, \
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
void* restrict beta, \
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Query the context for various blocksizes. */ \
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
|
||||
const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
|
||||
const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
|
||||
\
|
||||
/* Query the context for the microkernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
/*
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
*/ \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
/*
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
*/ \
|
||||
\
|
||||
/* Compute partitioning step values for each matrix of each loop. */ \
|
||||
const inc_t jcstep_c = cs_c; \
|
||||
const inc_t jcstep_b = cs_b; \
|
||||
\
|
||||
const inc_t pcstep_a = cs_a; \
|
||||
const inc_t pcstep_b = rs_b; \
|
||||
\
|
||||
const inc_t icstep_c = rs_c; \
|
||||
const inc_t icstep_a = rs_a; \
|
||||
\
|
||||
const inc_t jrstep_c = cs_c * NR; \
|
||||
\
|
||||
const inc_t irstep_c = rs_c * MR; \
|
||||
\
|
||||
ctype* restrict a_00 = a; \
|
||||
ctype* restrict b_00 = b; \
|
||||
ctype* restrict c_00 = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
\
|
||||
/* Make local copies of the scalars to prevent any unnecessary sharing of
|
||||
cache lines between the cores' caches. */ \
|
||||
ctype alpha_local = *alpha_cast; \
|
||||
ctype beta_local = *beta_cast; \
|
||||
ctype one_local = *PASTEMAC(ch,1); \
|
||||
/*ctype zero_local = *PASTEMAC(ch,0);*/ \
|
||||
\
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
|
||||
needed for the matrix we will be packing (if any), but we do it
|
||||
unconditionally to be safe. */ \
|
||||
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||
\
|
||||
/* Define an array of bszid_t ids, which will act as our substitute for
|
||||
the cntl_t tree. */ \
|
||||
bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \
|
||||
BLIS_KC, /* 4th loop */ \
|
||||
BLIS_NO_PART, /* pack B */ \
|
||||
BLIS_MC, /* 3rd loop */ \
|
||||
BLIS_NO_PART, /* pack A */ \
|
||||
BLIS_NR, /* 2nd loop */ \
|
||||
BLIS_MR, /* 1st loop */ \
|
||||
BLIS_KR }; /* microkernel loop */ \
|
||||
\
|
||||
bszid_t* restrict bszids_jc = &bszids[0]; \
|
||||
bszid_t* restrict bszids_pc = &bszids[1]; \
|
||||
/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
|
||||
bszid_t* restrict bszids_ic = &bszids[3]; \
|
||||
/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
|
||||
bszid_t* restrict bszids_jr = &bszids[5]; \
|
||||
/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
thrinfo_t* restrict thread_pb = NULL; \
|
||||
thrinfo_t* restrict thread_ic = NULL; \
|
||||
thrinfo_t* restrict thread_pa = NULL; \
|
||||
thrinfo_t* restrict thread_jr = NULL; \
|
||||
thrinfo_t* restrict thread_ir = NULL; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_jc = thread; \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
|
||||
\
|
||||
/* Compute the JC loop thread range for the current thread. */ \
|
||||
dim_t jc_start, jc_end; \
|
||||
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
|
||||
const dim_t n_local = jc_end - jc_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = n_local % NC; \
|
||||
\
|
||||
/* Loop over the n dimension (NC rows/columns at a time). */ \
|
||||
for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current JC block dimension. */ \
|
||||
const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
|
||||
\
|
||||
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
|
||||
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_pc = bli_thrinfo_sub_node( thread_jc ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
|
||||
\
|
||||
/* Compute the PC loop thread range for the current thread. */ \
|
||||
const dim_t pc_start = 0, pc_end = k; \
|
||||
const dim_t k_local = k; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the PC loop. */ \
|
||||
/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
|
||||
const dim_t pc_left = k_local % KC; \
|
||||
\
|
||||
/* Loop over the k dimension (KC rows/columns at a time). */ \
|
||||
for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current PC block dimension. */ \
|
||||
const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
|
||||
\
|
||||
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
|
||||
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
|
||||
\
|
||||
/* Only apply beta to the first iteration of the pc loop. */ \
|
||||
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
|
||||
\
|
||||
ctype* b_use; \
|
||||
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. Note that the thrinfo_t
|
||||
node will have already been created by a previous call to
|
||||
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
|
||||
cause the tree to grow by two (e.g. to the next bszid that is
|
||||
a normal bszid_t value). */ \
|
||||
thread_pb = bli_thrinfo_sub_node( thread_pc ); \
|
||||
/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
B. Then call the packm implementation. */ \
|
||||
PASTECH2(bls_,ch,packm_b) \
|
||||
( \
|
||||
conjb, \
|
||||
KC, NC, \
|
||||
kc_cur, nc_cur, NR, \
|
||||
&one_local, \
|
||||
b_pc, rs_b, cs_b, \
|
||||
&b_use, &rs_b_use, &cs_b_use, \
|
||||
&ps_b_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/* Alias b_use so that it's clear this is our current block of
|
||||
matrix B. */ \
|
||||
ctype* restrict b_pc_use = b_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
|
||||
\
|
||||
/* Compute the IC loop thread range for the current thread. */ \
|
||||
dim_t ic_start, ic_end; \
|
||||
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
|
||||
const dim_t m_local = ic_end - ic_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = m_local % MC; \
|
||||
\
|
||||
/* Loop over the m dimension (MC rows at a time). */ \
|
||||
for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current IC block dimension. */ \
|
||||
const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
|
||||
\
|
||||
ctype* restrict a_ic = a_pc + ii * icstep_a; \
|
||||
ctype* restrict c_ic = c_jc + ii * icstep_c; \
|
||||
\
|
||||
ctype* a_use; \
|
||||
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. Note that the thrinfo_t
|
||||
node will have already been created by a previous call to
|
||||
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
|
||||
cause the tree to grow by two (e.g. to the next bszid that is
|
||||
a normal bszid_t value). */ \
|
||||
thread_pa = bli_thrinfo_sub_node( thread_ic ); \
|
||||
/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
A. Then call the packm implementation. */ \
|
||||
PASTECH2(bls_,ch,packm_a) \
|
||||
( \
|
||||
conja, \
|
||||
MC, KC, \
|
||||
mc_cur, kc_cur, MR, \
|
||||
&one_local, \
|
||||
a_ic, rs_a, cs_a, \
|
||||
&a_use, &rs_a_use, &cs_a_use, \
|
||||
&ps_a_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread_pa \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
matrix A. */ \
|
||||
ctype* restrict a_ic_use = a_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for the JR loop.
|
||||
NOTE: These values are only needed when computing the next
|
||||
micropanel of B. */ \
|
||||
const dim_t jr_nt = bli_thread_n_way( thread_jr ); \
|
||||
const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JR loop. */ \
|
||||
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
|
||||
dim_t jr_left = nc_cur % NR; \
|
||||
\
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
|
||||
{ \
|
||||
const dim_t nr_cur \
|
||||
= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
|
||||
\
|
||||
ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
|
||||
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
||||
\
|
||||
/* Assume for now that our next panel of B to be the current panel
|
||||
of B. */ \
|
||||
ctype* restrict b2 = b_jr; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. */ \
|
||||
thread_ir = bli_thrinfo_sub_node( thread_jr ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for the IR loop.
|
||||
NOTE: These values are only needed when computing the next
|
||||
micropanel of A. */ \
|
||||
const dim_t ir_nt = bli_thread_n_way( thread_ir ); \
|
||||
const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IR loop. */ \
|
||||
dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
|
||||
dim_t ir_left = mc_cur % MR; \
|
||||
\
|
||||
/* Compute the IR loop thread range for the current thread. */ \
|
||||
dim_t ir_start, ir_end; \
|
||||
bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
|
||||
{ \
|
||||
const dim_t mr_cur \
|
||||
= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
|
||||
\
|
||||
ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
|
||||
ctype* restrict c_ir = c_jr + i * irstep_c; \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next micropanels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
|
||||
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_ic_use; \
|
||||
b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
|
||||
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
|
||||
b2 = b_pc_use; \
|
||||
} \
|
||||
\
|
||||
/* Save the addresses of next micropanels of A and B to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Call a wrapper to the kernel (which handles edge cases). */ \
|
||||
PASTECH2(bls_,ch,gemm_kernel) \
|
||||
( \
|
||||
MR, \
|
||||
NR, \
|
||||
mr_cur, \
|
||||
nr_cur, \
|
||||
kc_cur, \
|
||||
&alpha_local, \
|
||||
a_ir, rs_a_use, cs_a_use, \
|
||||
b_jr, rs_b_use, cs_b_use, \
|
||||
beta_use, \
|
||||
c_ir, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* This barrier is needed to prevent threads from starting to pack
|
||||
the next row panel of B before the current row panel is fully
|
||||
computed upon. */ \
|
||||
bli_thread_barrier( thread_pb ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Release any memory that was acquired for packing matrices A and B. */ \
|
||||
PASTECH2(bls_,ch,packm_finalize_mem_a) \
|
||||
( \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread_pa \
|
||||
); \
|
||||
PASTECH2(bls_,ch,packm_finalize_mem_b) \
|
||||
( \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( gemm_bp_var2 )
|
||||
GENTFUNC( float, s, gemm_bp_var2 )
|
||||
GENTFUNC( double, d, gemm_bp_var2 )
|
||||
GENTFUNC( scomplex, c, gemm_bp_var2 )
|
||||
GENTFUNC( dcomplex, z, gemm_bp_var2 )
|
||||
|
||||
//
|
||||
// -- gemm-like microkernel wrapper --------------------------------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
\
/* Wrapper around the native gemm microkernel that handles edge cases.
   A full microtile (mr_cur == MR and nr_cur == NR) is computed directly
   into C. Otherwise the microkernel computes into a temporary MR x NR
   buffer and only the leading mr_cur x nr_cur part is merged into C. */ \
void PASTECH2(bls_,ch,varname) \
     ( \
       const dim_t         MR, \
       const dim_t         NR, \
       dim_t               mr_cur, \
       dim_t               nr_cur, \
       dim_t               kc_cur, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict aux, \
       cntx_t*    restrict cntx  \
     ) \
{ \
	/* Infer the datatype from the ctype. */ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Query the context for the microkernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	            gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Handle interior and edge cases separately. */ \
	if ( mr_cur == MR && nr_cur == NR ) \
	{ \
		/* Interior case: invoke the gemm microkernel directly on C. */ \
		gemm_ukr \
		( \
		  kc_cur, \
		  alpha, \
		  a, \
		  b, \
		  beta, \
		  c, rs_c, cs_c, \
		  aux, \
		  cntx  \
		); \
	} \
	else \
	{ \
		/* Temporary C buffer for edge cases. It is declared, and cleared,
		   only in this branch so that interior microtiles (the common case)
		   do not pay for touching it. Its strides follow the storage
		   preference reported for the microkernel (col_pref): unit row
		   stride when columns are preferred, unit column stride otherwise. */ \
		ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
		                / sizeof( ctype ) ] \
		                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
		const bool  col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
		const inc_t rs_ct    = ( col_pref ? 1 : NR ); \
		const inc_t cs_ct    = ( col_pref ? MR : 1 ); \
\
		ctype       zero     = *PASTEMAC(ch,0); \
\
		/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
		PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \
\
		/* Invoke the gemm microkernel on the temporary buffer, passing a
		   beta of zero so that the user's beta is applied only once, in the
		   merge step below. */ \
		gemm_ukr \
		( \
		  kc_cur, \
		  alpha, \
		  a, \
		  b, \
		  &zero, \
		  ct, rs_ct, cs_ct, \
		  aux, \
		  cntx  \
		); \
\
		/* Scale the mr_cur x nr_cur edge of C by beta and add the result
		   from above. */ \
		PASTEMAC(ch,xpbys_mxn) \
		( \
		  mr_cur, \
		  nr_cur, \
		  ct, rs_ct, cs_ct, \
		  beta, \
		  c,  rs_c,  cs_c \
		); \
	} \
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( gemm_kernel )
|
||||
GENTFUNC( float, s, gemm_kernel )
|
||||
GENTFUNC( double, d, gemm_kernel )
|
||||
GENTFUNC( scomplex, c, gemm_kernel )
|
||||
GENTFUNC( dcomplex, z, gemm_kernel )
|
||||
|
||||
124
sandbox/gemmlike/bls_gemm_var.h
Normal file
124
sandbox/gemmlike/bls_gemm_var.h
Normal file
@@ -0,0 +1,124 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype the object-based variant interfaces.
|
||||
//
|
||||
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
\
/* Prototype for the object-based variant bls_<opname>(): scalars and
   matrices are passed as obj_t operands, followed by the context, runtime
   and thread-info pointers. */ \
void PASTECH(bls_,opname) \
     ( \
       obj_t*     alpha, \
       obj_t*     a, \
       obj_t*     b, \
       obj_t*     beta, \
       obj_t*     c, \
       cntx_t*    cntx, \
       rntm_t*    rntm, \
       thrinfo_t* thread  \
     );
|
||||
|
||||
GENPROT( gemm_bp_var1 )
|
||||
GENPROT( gemm_bp_var2 )
|
||||
|
||||
|
||||
//
|
||||
// Prototype the typed variant interfaces.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
\
/* Prototype for the typed variant bls_<ch><varname>(): conjugation flags,
   the m/n/k dimensions, then alpha, A, B, beta and C as untyped pointers
   (each matrix with its row and column strides), followed by the context,
   runtime and thread-info pointers. */ \
void PASTECH2(bls_,ch,varname) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       void*      restrict alpha, \
       void*      restrict a, inc_t rs_a, inc_t cs_a, \
       void*      restrict b, inc_t rs_b, inc_t cs_b, \
       void*      restrict beta, \
       void*      restrict c, inc_t rs_c, inc_t cs_c, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       thrinfo_t* restrict thread  \
     );
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( gemm_bp_var1 )
|
||||
GENTPROT( float, s, gemm_bp_var1 )
|
||||
GENTPROT( double, d, gemm_bp_var1 )
|
||||
GENTPROT( scomplex, c, gemm_bp_var1 )
|
||||
GENTPROT( dcomplex, z, gemm_bp_var1 )
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( gemm_bp_var2 )
|
||||
GENTPROT( float, s, gemm_bp_var2 )
|
||||
GENTPROT( double, d, gemm_bp_var2 )
|
||||
GENTPROT( scomplex, c, gemm_bp_var2 )
|
||||
GENTPROT( dcomplex, z, gemm_bp_var2 )
|
||||
|
||||
|
||||
//
|
||||
// Prototype the typed kernel interfaces.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
\
/* Prototype for the typed microkernel wrapper bls_<ch><varname>(). MR and
   NR are the register blocksizes; mr_cur and nr_cur are the dimensions of
   the current microtile; kc_cur is the k dimension handed to the
   microkernel. Parameter names mirror the function definition. */ \
void PASTECH2(bls_,ch,varname) \
     ( \
       const dim_t         MR, \
       const dim_t         NR, \
       dim_t               mr_cur, \
       dim_t               nr_cur, \
       dim_t               kc_cur, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict aux, \
       cntx_t*    restrict cntx  \
     );
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( gemm_kernel )
|
||||
GENTPROT( float, s, gemm_kernel )
|
||||
GENTPROT( double, d, gemm_kernel )
|
||||
GENTPROT( scomplex, c, gemm_kernel )
|
||||
GENTPROT( dcomplex, z, gemm_kernel )
|
||||
|
||||
328
sandbox/gemmlike/bls_l3_packm_a.c
Normal file
328
sandbox/gemmlike/bls_l3_packm_a.c
Normal file
@@ -0,0 +1,328 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
\
/* Make sure the caller's mem_t refers to a block large enough to hold an
   m x k block of A once m has been rounded up to a multiple of mr. The
   chief thread acquires (or re-acquires) the block and every other thread
   copies the chief's mem_t. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	/* Request blocks from the pool dedicated to blocks of A. */ \
	const packbuf_t buf_type = BLIS_BUFFER_FOR_A_BLOCK; \
\
	/* Round m up to a whole number of micropanels. This is required
	   because the last micropanel must have the same leading dimension
	   as all the others: the microkernel assumes that neither the register
	   nor the storage blocksizes change. */ \
	const dim_t m_padded = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
	const dim_t k_padded = k; \
\
	/* Wait until all threads are caught up and ready to begin the packm
	   stage. */ \
	bli_thread_barrier( thread ); \
\
	/* Number of bytes the packed block needs. */ \
	siz_t bytes_needed = sizeof( ctype ) * m_padded * k_padded; \
\
	if ( bli_mem_is_unalloc( mem ) ) \
	{ \
		/* Nothing is cached yet: the chief thread acquires a block from the
		   memory broker directly into its passed-in mem_t. It must be that
		   mem_t and not a local temporary, since there is no barrier until
		   after packing and the chief could otherwise leave this function
		   before the other threads have copied from it. */ \
		if ( bli_thread_am_ochief( thread ) ) \
		{ \
			bli_membrk_acquire_m( rntm, bytes_needed, buf_type, mem ); \
		} \
	} \
	else \
	{ \
		/* A block was already acquired and cached by the caller. If it is
		   large enough, use it as-is; no further action is needed. */ \
		if ( bli_mem_size( mem ) >= bytes_needed ) return; \
\
		/* Otherwise the chief thread swaps the undersized block for a new
		   one, again writing directly to its passed-in mem_t. */ \
		if ( bli_thread_am_ochief( thread ) ) \
		{ \
			bli_membrk_release( rntm, mem ); \
			bli_membrk_acquire_m( rntm, bytes_needed, buf_type, mem ); \
		} \
	} \
\
	/* Broadcast the address of the chief thread's passed-in mem_t to all
	   threads. */ \
	mem_t* chief_mem = bli_thread_broadcast( thread, mem ); \
\
	/* Non-chief threads copy the chief's mem_t into their own; the chief
	   already holds it and has nothing to copy. */ \
	if ( !bli_thread_am_ochief( thread ) ) \
	{ \
		*mem = *chief_mem; \
	} \
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_init_mem_a )
|
||||
GENTFUNC( float, s, packm_init_mem_a )
|
||||
GENTFUNC( double, d, packm_init_mem_a )
|
||||
GENTFUNC( scomplex, c, packm_init_mem_a )
|
||||
GENTFUNC( dcomplex, z, packm_init_mem_a )
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
\
/* Release the packing block associated with mem, if there is one. Only
   the chief thread of a non-NULL thrinfo_t node performs the release. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	/* No thread node: nothing to do. */ \
	if ( thread == NULL ) return; \
\
	/* Non-chief threads never release the block. */ \
	if ( !bli_thread_am_ochief( thread ) ) return; \
\
	/* Only proceed if the mem_t entry is allocated, which it should be. */ \
	if ( bli_mem_is_alloc( mem ) ) \
	{ \
		bli_membrk_release( rntm, mem ); \
	} \
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a )
|
||||
GENTFUNC( float, s, packm_finalize_mem_a )
|
||||
GENTFUNC( double, d, packm_finalize_mem_a )
|
||||
GENTFUNC( scomplex, c, packm_finalize_mem_a )
|
||||
GENTFUNC( dcomplex, z, packm_finalize_mem_a )
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
\
/* Compute the schema, padded dimensions, strides and buffer address used
   to pack an m x k block of A, writing each through the corresponding
   output pointer. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       pack_t*    restrict schema, \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       dim_t*     restrict m_max, \
       dim_t*     restrict k_max, \
       ctype**             p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              dim_t* restrict pd_p, inc_t* restrict ps_p, \
       mem_t*     restrict mem  \
     ) \
{ \
	/* A is packed to conventional column-stored row panels. */ \
	*schema = BLIS_PACKED_ROW_PANELS; \
\
	/* Within a panel, rows are contiguous and columns are mr apart. */ \
	*rs_p = 1; \
	*cs_p = mr; \
\
	/* Each panel is mr wide and consecutive panels are mr * k apart. */ \
	*pd_p = mr; \
	*ps_p = mr * k; \
\
	/* Round m up to a whole number of micropanels. The last micropanel
	   must have the same leading dimension (cs_p) as the others because
	   the microkernel assumes that neither the register nor the storage
	   blocksizes change. */ \
	*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
	*k_max = k; \
\
	/* Point the caller's buffer at the memory held by the mem_t entry. */ \
	*p = bli_mem_buffer( mem ); \
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_init_a )
|
||||
GENTFUNC( float, s, packm_init_a )
|
||||
GENTFUNC( double, d, packm_init_a )
|
||||
GENTFUNC( scomplex, c, packm_init_a )
|
||||
GENTFUNC( dcomplex, z, packm_init_a )
|
||||
|
||||
|
||||
//
|
||||
// Define the typed function that packs a block of matrix A.
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
\
/* Pack an m x k block of A: prepare the destination buffer (sized from
   m_alloc x k_alloc), compute the packing parameters, run the packing
   variant, and then synchronize the threads. The buffer address and its
   strides are returned through p, rs_p, cs_p and ps_p. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       conj_t              conj, \
       dim_t               m_alloc, \
       dim_t               k_alloc, \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       ctype*     restrict kappa, \
       ctype*     restrict a, inc_t           rs_a, inc_t           cs_a, \
       ctype**    restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              inc_t* restrict ps_p, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	pack_t pack_schema; \
	dim_t  m_padded; \
	dim_t  k_padded; \
	dim_t  panel_dim; \
\
	/* Step 1: make sure the destination buffer exists and is big enough. */ \
	PASTECH2(bls_,ch,packm_init_mem_a) \
	( \
	  m_alloc, k_alloc, mr, \
	  cntx, \
	  rntm, \
	  mem, \
	  thread  \
	); \
\
	/* Step 2: determine the packing buffer and related parameters for
	   matrix A. */ \
	PASTECH2(bls_,ch,packm_init_a) \
	( \
	  &pack_schema, \
	  m, k, mr, \
	  &m_padded, &k_padded, \
	  p, rs_p, cs_p, \
	  &panel_dim, ps_p, \
	  mem  \
	); \
\
	/* Step 3: pack matrix A into the buffer chosen above, as column-stored
	   MR x k micropanels. */ \
	PASTECH2(bls_,ch,packm_var1) \
	( \
	  conj, \
	  pack_schema, \
	  m, \
	  k, \
	  m_padded, \
	  k_padded, \
	  kappa, \
	  a,  rs_a,  cs_a, \
	  *p, *rs_p, *cs_p, \
	  panel_dim, *ps_p, \
	  cntx, \
	  thread  \
	); \
\
	/* Step 4: barrier so that packing is done before computation. */ \
	bli_thread_barrier( thread ); \
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_a )
|
||||
GENTFUNC( float, s, packm_a )
|
||||
GENTFUNC( double, d, packm_a )
|
||||
GENTFUNC( scomplex, c, packm_a )
|
||||
GENTFUNC( dcomplex, z, packm_a )
|
||||
|
||||
122
sandbox/gemmlike/bls_l3_packm_a.h
Normal file
122
sandbox/gemmlike/bls_l3_packm_a.h
Normal file
@@ -0,0 +1,122 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
\
/* Prototype for bls_<ch><opname>(), which prepares the mem_t entry used as
   the packing buffer for an m x k block of A with register blocksize mr. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     );
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_init_mem_a )
|
||||
GENTPROT( float, s, packm_init_mem_a )
|
||||
GENTPROT( double, d, packm_init_mem_a )
|
||||
GENTPROT( scomplex, c, packm_init_mem_a )
|
||||
GENTPROT( dcomplex, z, packm_init_mem_a )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
\
/* Prototype for bls_<ch><opname>(), which releases the packing buffer
   associated with the given mem_t entry. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     );
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a )
|
||||
GENTPROT( float, s, packm_finalize_mem_a )
|
||||
GENTPROT( double, d, packm_finalize_mem_a )
|
||||
GENTPROT( scomplex, c, packm_finalize_mem_a )
|
||||
GENTPROT( dcomplex, z, packm_finalize_mem_a )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
\
/* Prototype for bls_<ch><opname>(), which reports the schema, padded
   dimensions, strides and buffer address for packing a block of A through
   its output pointers. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       pack_t*    restrict schema, \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       dim_t*     restrict m_max, \
       dim_t*     restrict k_max, \
       ctype**             p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              dim_t* restrict pd_p, inc_t* restrict ps_p, \
       mem_t*     restrict mem  \
     );
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_init_a )
|
||||
GENTPROT( float, s, packm_init_a )
|
||||
GENTPROT( double, d, packm_init_a )
|
||||
GENTPROT( scomplex, c, packm_init_a )
|
||||
GENTPROT( dcomplex, z, packm_init_a )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
\
/* Prototype for bls_<ch><opname>(), which packs an m x k block of A into a
   buffer sized from m_alloc x k_alloc and returns the buffer address and
   strides through p, rs_p, cs_p and ps_p. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       conj_t              conj, \
       dim_t               m_alloc, \
       dim_t               k_alloc, \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       ctype*     restrict kappa, \
       ctype*     restrict a, inc_t           rs_a, inc_t           cs_a, \
       ctype**    restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              inc_t* restrict ps_p, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     );
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_a )
|
||||
GENTPROT( float, s, packm_a )
|
||||
GENTPROT( double, d, packm_a )
|
||||
GENTPROT( scomplex, c, packm_a )
|
||||
GENTPROT( dcomplex, z, packm_a )
|
||||
|
||||
328
sandbox/gemmlike/bls_l3_packm_b.c
Normal file
328
sandbox/gemmlike/bls_l3_packm_b.c
Normal file
@@ -0,0 +1,328 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,opname) \
|
||||
( \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
/* Set the pack buffer type so that we are obtaining memory blocks from
|
||||
the pool dedicated to panels of B. */ \
|
||||
const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; \
|
||||
\
|
||||
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
|
||||
we NEED that last micropanel to have the same ldim (cs_p) as the other
|
||||
micropanels. Why? Because the microkernel assumes that the register (MR,
|
||||
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
|
||||
const dim_t k_pack = k; \
|
||||
const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
|
||||
\
|
||||
/* Barrier to make sure all threads are caught up and ready to begin the
|
||||
packm stage. */ \
|
||||
bli_thread_barrier( thread ); \
|
||||
\
|
||||
/* Compute the size of the memory block eneded. */ \
|
||||
siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
|
||||
\
|
||||
/* Check the mem_t entry provided by the caller. If it is unallocated,
|
||||
then we need to acquire a block from the memory broker. */ \
|
||||
if ( bli_mem_is_unalloc( mem ) ) \
|
||||
{ \
|
||||
if ( bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
/* Acquire directly to the chief thread's mem_t that was passed in.
|
||||
It needs to be that mem_t struct, and not a local (temporary)
|
||||
mem_t, since there is no barrier until after packing is finished,
|
||||
which could allow a race condition whereby the chief thread exits
|
||||
the current function before the other threads have a chance to
|
||||
copy from it. (A barrier would fix that race condition, but then
|
||||
again, I prefer to keep barriers to a minimum.) */ \
|
||||
bli_membrk_acquire_m \
|
||||
( \
|
||||
rntm, \
|
||||
size_needed, \
|
||||
pack_buf_type, \
|
||||
mem \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/* Broadcast the address of the chief thread's passed-in mem_t to all
|
||||
threads. */ \
|
||||
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
|
||||
\
|
||||
/* Non-chief threads: Copy the contents of the chief thread's
|
||||
passed-in mem_t to the passed-in mem_t for this thread. (The
|
||||
chief thread already has the mem_t, so it does not need to
|
||||
perform any copy.) */ \
|
||||
if ( !bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
*mem = *mem_p; \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_mem_is_alloc( mem ) ) */ \
|
||||
{ \
|
||||
/* If the mem_t entry provided by the caller does NOT contain a NULL
|
||||
buffer, then a block has already been acquired from the memory
|
||||
broker and cached by the caller. */ \
|
||||
\
|
||||
/* As a sanity check, we should make sure that the mem_t object isn't
|
||||
associated with a block that is too small compared to the size of
|
||||
the packed matrix buffer that is needed, according to the value
|
||||
computed above. */ \
|
||||
siz_t mem_size = bli_mem_size( mem ); \
|
||||
\
|
||||
if ( mem_size < size_needed ) \
|
||||
{ \
|
||||
if ( bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
/* The chief thread releases the existing block associated
|
||||
with the mem_t, and then re-acquires a new block, saving
|
||||
the associated mem_t to its passed-in mem_t. (See coment
|
||||
above for why the acquisition needs to be directly to
|
||||
the chief thread's passed-in mem_t and not a local
|
||||
(temporary) mem_t. */ \
|
||||
bli_membrk_release \
|
||||
( \
|
||||
rntm, \
|
||||
mem \
|
||||
); \
|
||||
bli_membrk_acquire_m \
|
||||
( \
|
||||
rntm, \
|
||||
size_needed, \
|
||||
pack_buf_type, \
|
||||
mem \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/* Broadcast the address of the chief thread's passed-in mem_t
|
||||
to all threads. */ \
|
||||
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
|
||||
\
|
||||
/* Non-chief threads: Copy the contents of the chief thread's
|
||||
passed-in mem_t to the passed-in mem_t for this thread. (The
|
||||
chief thread already has the mem_t, so it does not need to
|
||||
perform any copy.) */ \
|
||||
if ( !bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
*mem = *mem_p; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* If the mem_t entry is already allocated and sufficiently large,
|
||||
then we use it as-is. No action is needed. */ \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_init_mem_b )
|
||||
GENTFUNC( float, s, packm_init_mem_b )
|
||||
GENTFUNC( double, d, packm_init_mem_b )
|
||||
GENTFUNC( scomplex, c, packm_init_mem_b )
|
||||
GENTFUNC( dcomplex, z, packm_init_mem_b )
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,opname) \
|
||||
( \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
if ( thread != NULL ) \
|
||||
if ( bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
/* Check the mem_t entry provided by the caller. Only proceed if it
|
||||
is allocated, which it should be. */ \
|
||||
if ( bli_mem_is_alloc( mem ) ) \
|
||||
{ \
|
||||
bli_membrk_release \
|
||||
( \
|
||||
rntm, \
|
||||
mem \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b )
|
||||
GENTFUNC( float, s, packm_finalize_mem_b )
|
||||
GENTFUNC( double, d, packm_finalize_mem_b )
|
||||
GENTFUNC( scomplex, c, packm_finalize_mem_b )
|
||||
GENTFUNC( dcomplex, z, packm_finalize_mem_b )
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,opname) \
|
||||
( \
|
||||
pack_t* restrict schema, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
dim_t* restrict k_max, \
|
||||
dim_t* restrict n_max, \
|
||||
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||
mem_t* restrict mem \
|
||||
) \
|
||||
{ \
|
||||
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
|
||||
we NEED that last micropanel to have the same ldim (cs_p) as the other
|
||||
micropanels. Why? Because the microkernel assumes that the register (MR,
|
||||
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
|
||||
*k_max = k; \
|
||||
*n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
|
||||
\
|
||||
/* Determine the dimensions and strides for the packed matrix B. */ \
|
||||
{ \
|
||||
/* Pack B to row-stored column-panels. */ \
|
||||
*rs_p = nr; \
|
||||
*cs_p = 1; \
|
||||
\
|
||||
*pd_p = nr; \
|
||||
*ps_p = k * nr; \
|
||||
\
|
||||
/* Set the schema to "packed column panels" to indicate packing to
|
||||
conventional row-stored column panels. */ \
|
||||
*schema = BLIS_PACKED_COL_PANELS; \
|
||||
} \
|
||||
\
|
||||
/* Set the buffer address provided by the caller to point to the memory
|
||||
associated with the mem_t entry acquired from the memory pool. */ \
|
||||
*p = bli_mem_buffer( mem ); \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_init_b )
|
||||
GENTFUNC( float, s, packm_init_b )
|
||||
GENTFUNC( double, d, packm_init_b )
|
||||
GENTFUNC( scomplex, c, packm_init_b )
|
||||
GENTFUNC( dcomplex, z, packm_init_b )
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces to the variant chooser.
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,opname) \
|
||||
( \
|
||||
conj_t conj, \
|
||||
dim_t k_alloc, \
|
||||
dim_t n_alloc, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
pack_t schema; \
|
||||
dim_t k_max; \
|
||||
dim_t n_max; \
|
||||
dim_t pd_p; \
|
||||
\
|
||||
/* Prepare the packing destination buffer. */ \
|
||||
PASTECH2(bls_,ch,packm_init_mem_b) \
|
||||
( \
|
||||
k_alloc, n_alloc, nr, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
mem, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix B. */ \
|
||||
PASTECH2(bls_,ch,packm_init_b) \
|
||||
( \
|
||||
&schema, \
|
||||
k, n, nr, \
|
||||
&k_max, &n_max, \
|
||||
p, rs_p, cs_p, \
|
||||
&pd_p, ps_p, \
|
||||
mem \
|
||||
); \
|
||||
\
|
||||
/* Pack matrix B to the destination buffer chosen above. Here, the packed
|
||||
matrix is stored to row-stored k x NR micropanels. */ \
|
||||
PASTECH2(bls_,ch,packm_var1) \
|
||||
( \
|
||||
conj, \
|
||||
schema, \
|
||||
k, \
|
||||
n, \
|
||||
k_max, \
|
||||
n_max, \
|
||||
kappa, \
|
||||
b, rs_b, cs_b, \
|
||||
*p, *rs_p, *cs_p, \
|
||||
pd_p, *ps_p, \
|
||||
cntx, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Barrier so that packing is done before computation. */ \
|
||||
bli_thread_barrier( thread ); \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_b )
|
||||
GENTFUNC( float, s, packm_b )
|
||||
GENTFUNC( double, d, packm_b )
|
||||
GENTFUNC( scomplex, c, packm_b )
|
||||
GENTFUNC( dcomplex, z, packm_b )
|
||||
|
||||
122
sandbox/gemmlike/bls_l3_packm_b.h
Normal file
122
sandbox/gemmlike/bls_l3_packm_b.h
Normal file
@@ -0,0 +1,122 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,opname) \
|
||||
( \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_init_mem_b )
|
||||
GENTPROT( float, s, packm_init_mem_b )
|
||||
GENTPROT( double, d, packm_init_mem_b )
|
||||
GENTPROT( scomplex, c, packm_init_mem_b )
|
||||
GENTPROT( dcomplex, z, packm_init_mem_b )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,opname) \
|
||||
( \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b )
|
||||
GENTPROT( float, s, packm_finalize_mem_b )
|
||||
GENTPROT( double, d, packm_finalize_mem_b )
|
||||
GENTPROT( scomplex, c, packm_finalize_mem_b )
|
||||
GENTPROT( dcomplex, z, packm_finalize_mem_b )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,opname) \
|
||||
( \
|
||||
pack_t* restrict schema, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
dim_t* restrict k_max, \
|
||||
dim_t* restrict n_max, \
|
||||
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||
mem_t* restrict mem \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_init_b )
|
||||
GENTPROT( float, s, packm_init_b )
|
||||
GENTPROT( double, d, packm_init_b )
|
||||
GENTPROT( scomplex, c, packm_init_b )
|
||||
GENTPROT( dcomplex, z, packm_init_b )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,opname) \
|
||||
( \
|
||||
conj_t conj, \
|
||||
dim_t k_alloc, \
|
||||
dim_t n_alloc, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_b )
|
||||
GENTPROT( float, s, packm_b )
|
||||
GENTPROT( double, d, packm_b )
|
||||
GENTPROT( scomplex, c, packm_b )
|
||||
GENTPROT( dcomplex, z, packm_b )
|
||||
|
||||
198
sandbox/gemmlike/bls_l3_packm_var.c
Normal file
198
sandbox/gemmlike/bls_l3_packm_var.c
Normal file
@@ -0,0 +1,198 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces to the variants.
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,varname) \
|
||||
( \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t n_iter; \
|
||||
dim_t it, ic; \
|
||||
dim_t ic0; \
|
||||
doff_t ic_inc; \
|
||||
dim_t panel_len_full; \
|
||||
dim_t panel_len_i; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_len_max_i; \
|
||||
dim_t panel_dim_i; \
|
||||
dim_t panel_dim_max; \
|
||||
inc_t vs_c; \
|
||||
inc_t ldc; \
|
||||
inc_t ldp; \
|
||||
conj_t conjc; \
|
||||
\
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
bool row_stored = bli_is_col_packed( schema ); \
|
||||
/*bool col_stored = bli_is_row_packed( schema );*/ \
|
||||
\
|
||||
/* If the row storage flag indicates row storage, then we are packing
|
||||
to column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( row_stored ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len_full = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
vs_c = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( col_stored ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len_full = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
vs_c = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
\
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */ \
|
||||
{ \
|
||||
ic0 = 0; \
|
||||
ic_inc = panel_dim_max; \
|
||||
} \
|
||||
\
|
||||
ctype* restrict p_begin = p_cast; \
|
||||
\
|
||||
/* Query the number of threads and thread ids from the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
const dim_t nt = bli_thread_n_way( thread ); \
|
||||
const dim_t tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
|
||||
( void )nt; \
|
||||
( void )tid; \
|
||||
\
|
||||
dim_t it_start, it_end, it_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment using the current thread's
|
||||
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
|
||||
will depend on whether slab or round-robin partitioning was requested
|
||||
at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
|
||||
\
|
||||
/* Iterate over every logical micropanel in the source matrix. */ \
|
||||
for ( ic = ic0, it = 0; it < n_iter; \
|
||||
ic += ic_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
ctype* restrict c_begin = c_cast + (ic )*vs_c; \
|
||||
\
|
||||
ctype* restrict c_use = c_begin; \
|
||||
ctype* restrict p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
/* The definition of bli_packm_my_iter() will depend on whether slab
|
||||
or round-robin partitioning was requested at configure-time. (The
|
||||
default is slab.) */ \
|
||||
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_cxk) \
|
||||
( \
|
||||
conjc, \
|
||||
schema, \
|
||||
panel_dim_i, \
|
||||
panel_dim_max, \
|
||||
panel_len_i, \
|
||||
panel_len_max_i, \
|
||||
kappa_cast, \
|
||||
c_use, vs_c, ldc, \
|
||||
p_use, ldp, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
if ( !row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
else \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
p_begin += ps_p; \
|
||||
} \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_var1 )
|
||||
GENTFUNC( float, s, packm_var1 )
|
||||
GENTFUNC( double, d, packm_var1 )
|
||||
GENTFUNC( scomplex, c, packm_var1 )
|
||||
GENTFUNC( dcomplex, z, packm_var1 )
|
||||
|
||||
63
sandbox/gemmlike/bls_l3_packm_var.h
Normal file
63
sandbox/gemmlike/bls_l3_packm_var.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces to the variants.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bls_,ch,varname) \
|
||||
( \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
thrinfo_t* restrict thread \
|
||||
);
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_var1 )
|
||||
GENTPROT( float, s, packm_var1 )
|
||||
GENTPROT( double, d, packm_var1 )
|
||||
GENTPROT( scomplex, c, packm_var1 )
|
||||
GENTPROT( dcomplex, z, packm_var1 )
|
||||
|
||||
73
sandbox/gemmlike/thread/bls_l3_decor.h
Normal file
73
sandbox/gemmlike/thread/bls_l3_decor.h
Normal file
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SBX_L3_DECOR_H
|
||||
#define BLIS_SBX_L3_DECOR_H
|
||||
|
||||
// -- sup definitions ----------------------------------------------------------
|
||||
|
||||
// Level-3 sup internal function type.
|
||||
typedef void (*l3sbxint_t)
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Level-3 sup thread decorator prototype.
|
||||
void bls_l3_thread_decorator
|
||||
(
|
||||
l3sbxint_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
// Include definitions specific to the method of multithreading.
|
||||
#include "bls_l3_decor_single.h"
|
||||
#include "bls_l3_decor_openmp.h"
|
||||
#include "bls_l3_decor_pthreads.h"
|
||||
|
||||
#endif
|
||||
|
||||
138
sandbox/gemmlike/thread/bls_l3_decor_openmp.c
Normal file
138
sandbox/gemmlike/thread/bls_l3_decor_openmp.c
Normal file
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
// Dummy thread entry function. The real one is only needed by the pthreads
// implementation; this stub exists so that Windows DLL builds (with OpenMP
// enabled or with no multithreading) do not end up with an unresolved symbol.
// It ignores its argument and always returns NULL.
void* bls_l3_thread_entry( void* data_void )
{
	( void )data_void;

	return NULL;
}
|
||||
|
||||
//#define PRINT_THRINFO
|
||||
|
||||
void bls_l3_thread_decorator
|
||||
(
|
||||
l3sbxint_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
// Query the total number of threads from the rntm_t object.
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we have the rntm_t.sba_pool field
|
||||
// initialized and ready for the global communicator creation below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm. This will be
|
||||
// inherited by all of the child threads when they make local copies of
|
||||
// the rntm below.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
|
||||
|
||||
_Pragma( "omp parallel num_threads(n_threads)" )
|
||||
{
|
||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||
// necessary since we want each thread to be able to track its own
|
||||
// small block pool_t as it executes down the function stack.
|
||||
rntm_t rntm_l = *rntm;
|
||||
rntm_t* restrict rntm_p = &rntm_l;
|
||||
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
// NOTE: This calls the same function used for the conventional/large
|
||||
// code path.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
thrinfo_t* thread = NULL;
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_free( rntm_p, thread );
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called from the thread entry function).
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
44
sandbox/gemmlike/thread/bls_l3_decor_openmp.h
Normal file
44
sandbox/gemmlike/thread/bls_l3_decor_openmp.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SBX_L3_DECOR_OPENMP_H
|
||||
#define BLIS_SBX_L3_DECOR_OPENMP_H
|
||||
|
||||
// Definitions specific to situations when OpenMP multithreading is enabled.
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
213
sandbox/gemmlike/thread/bls_l3_decor_pthreads.c
Normal file
213
sandbox/gemmlike/thread/bls_l3_decor_pthreads.c
Normal file
@@ -0,0 +1,213 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
|
||||
// A data structure to assist in passing operands to additional threads.
|
||||
typedef struct thread_data
|
||||
{
|
||||
l3sbxint_t func;
|
||||
opid_t family;
|
||||
obj_t* alpha;
|
||||
obj_t* a;
|
||||
obj_t* b;
|
||||
obj_t* beta;
|
||||
obj_t* c;
|
||||
cntx_t* cntx;
|
||||
rntm_t* rntm;
|
||||
dim_t tid;
|
||||
thrcomm_t* gl_comm;
|
||||
array_t* array;
|
||||
} thread_data_t;
|
||||
|
||||
// Entry point function for additional threads.
|
||||
void* bls_l3_thread_entry( void* data_void )
|
||||
{
|
||||
thread_data_t* data = data_void;
|
||||
|
||||
l3sbxint_t func = data->func;
|
||||
opid_t family = data->family;
|
||||
obj_t* alpha = data->alpha;
|
||||
obj_t* a = data->a;
|
||||
obj_t* b = data->b;
|
||||
obj_t* beta = data->beta;
|
||||
obj_t* c = data->c;
|
||||
cntx_t* cntx = data->cntx;
|
||||
rntm_t* rntm = data->rntm;
|
||||
dim_t tid = data->tid;
|
||||
array_t* array = data->array;
|
||||
thrcomm_t* gl_comm = data->gl_comm;
|
||||
|
||||
( void )family;
|
||||
|
||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||
// necessary since we want each thread to be able to track its own
|
||||
// small block pool_t as it executes down the function stack.
|
||||
rntm_t rntm_l = *rntm;
|
||||
rntm_t* restrict rntm_p = &rntm_l;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
thrinfo_t* thread = NULL;
|
||||
|
||||
// Create the root node of the current thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_free( rntm_p, thread );
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void bls_l3_thread_decorator
|
||||
(
|
||||
l3sbxint_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
// Query the total number of threads from the context.
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we have the rntm_t.sba_pool field
|
||||
// initialized and ready for the global communicator creation below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm. This will be
|
||||
// inherited by all of the child threads when they make local copies of
|
||||
// the rntm below.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
// Allocate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
|
||||
// Allocate an array of pthread objects and auxiliary data structs to pass
|
||||
// to the thread entry functions.
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
|
||||
|
||||
// NOTE: We must iterate backwards so that the chief thread (thread id 0)
|
||||
// can spawn all other threads before proceeding with its own computation.
|
||||
for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
|
||||
{
|
||||
// Set up thread data for additional threads (beyond thread 0).
|
||||
datas[tid].func = func;
|
||||
datas[tid].family = family;
|
||||
datas[tid].alpha = alpha;
|
||||
datas[tid].a = a;
|
||||
datas[tid].b = b;
|
||||
datas[tid].beta = beta;
|
||||
datas[tid].c = c;
|
||||
datas[tid].cntx = cntx;
|
||||
datas[tid].rntm = rntm;
|
||||
datas[tid].tid = tid;
|
||||
datas[tid].gl_comm = gl_comm;
|
||||
datas[tid].array = array;
|
||||
|
||||
// Spawn additional threads for ids greater than 1.
|
||||
if ( tid != 0 )
|
||||
bli_pthread_create( &pthreads[tid], NULL, &bls_l3_thread_entry, &datas[tid] );
|
||||
else
|
||||
bls_l3_thread_entry( ( void* )(&datas[0]) );
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called from the thread entry function).
|
||||
|
||||
// Thread 0 waits for additional threads to finish.
|
||||
for ( dim_t tid = 1; tid < n_threads; tid++ )
|
||||
{
|
||||
bli_pthread_join( pthreads[tid], NULL );
|
||||
}
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_free_intl( pthreads );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_free_intl( datas );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
47
sandbox/gemmlike/thread/bls_l3_decor_pthreads.h
Normal file
47
sandbox/gemmlike/thread/bls_l3_decor_pthreads.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H
|
||||
#define BLIS_SBX_L3_DECOR_PTHREADS_H
|
||||
|
||||
// Definitions specific to situations when POSIX multithreading is enabled.
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
|
||||
// Thread entry point prototype.
|
||||
void* bls_l3_thread_entry( void* data_void );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
141
sandbox/gemmlike/thread/bls_l3_decor_single.c
Normal file
141
sandbox/gemmlike/thread/bls_l3_decor_single.c
Normal file
@@ -0,0 +1,141 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
#define SKIP_THRINFO_TREE
|
||||
|
||||
void bls_l3_thread_decorator
|
||||
(
|
||||
l3sbxint_t func,
|
||||
opid_t family,
|
||||
//pack_t schema_a,
|
||||
//pack_t schema_b,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
// For sequential execution, we use only one thread.
|
||||
const dim_t n_threads = 1;
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
#ifndef SKIP_THRINFO_TREE
|
||||
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
#endif
|
||||
|
||||
|
||||
{
|
||||
// NOTE: We don't need to create another copy of the rntm_t since
|
||||
// it was already copied in one of the high-level oapi functions.
|
||||
rntm_t* restrict rntm_p = rntm;
|
||||
|
||||
// There is only one thread id (for the thief thread).
|
||||
const dim_t tid = 0;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
// NOTE: This is commented out because, in the single-threaded case,
|
||||
// this is redundant since it's already been done above.
|
||||
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
#ifndef SKIP_THRINFO_TREE
|
||||
thrinfo_t* thread = NULL;
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
|
||||
#else
|
||||
// This optimization allows us to use one of the global thrinfo_t
|
||||
// objects for single-threaded execution rather than grow one from
|
||||
// scratch. The key is that bli_thrinfo_sup_grow(), which is called
|
||||
// from within the variants, will immediately return if it detects
|
||||
// that the thrinfo_t* passed into it is either
|
||||
// &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED.
|
||||
thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED;
|
||||
|
||||
( void )tid;
|
||||
#endif
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
thread
|
||||
);
|
||||
|
||||
#ifndef SKIP_THRINFO_TREE
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_free( rntm_p, thread );
|
||||
#endif
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called above).
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
44
sandbox/gemmlike/thread/bls_l3_decor_single.h
Normal file
44
sandbox/gemmlike/thread/bls_l3_decor_single.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SBX_L3_DECOR_SINGLE_H
|
||||
#define BLIS_SBX_L3_DECOR_SINGLE_H
|
||||
|
||||
// Definitions specific to situations when multithreading is disabled.
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -32,7 +32,14 @@
|
||||
|
||||
*/
|
||||
|
||||
// This file is needed for the BLIS build system.
|
||||
// Given the current architecture of BLIS sandboxes, bli_gemmnat() is the
|
||||
// entry point to any sandbox implementation.
|
||||
|
||||
// NOTE: This function is implemented identically to the function that it
|
||||
// overrides in frame/ind/oapi/bli_l3_nat_oapi.c. This means that we are
|
||||
// forgoing the option of customizing the implementations that underlie
|
||||
// bli_gemm() and bli_?gemm(). Any new code defined in this sandbox
|
||||
// directory, however, will be included in the BLIS library.
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ function r_val = plot_panel_4x5 ...
|
||||
thr_str, ...
|
||||
dirpath, ...
|
||||
arch_str, ...
|
||||
vend_str ...
|
||||
vend_leg_str ...
|
||||
)
|
||||
|
||||
impl = 'octave';
|
||||
@@ -25,11 +25,13 @@ else
|
||||
position = [100 100 1864 1540];
|
||||
papersize = [15.6 19.4];
|
||||
%leg_pos_st = [1.15 8.70 2.1 1.2 ]; % (dgemm)
|
||||
leg_pos_st = [1.60 8.80 2.1 1.2 ]; % (dgemm)
|
||||
%leg_pos_st = [1.60 8.80 2.1 1.2 ]; % (dgemm)
|
||||
leg_pos_st = [15.90 13.60 2.1 1.2 ]; % (strsm)
|
||||
%leg_pos_mt = [12.20 13.60 2.1 1.2 ]; % (strmm)
|
||||
%leg_pos_mt = [5.30 12.60 2.1 1.2 ]; % (ssymm)
|
||||
%leg_pos_mt = [8.50 13.62 2.1 1.2 ]; % (ssyrk)
|
||||
leg_pos_mt = [5.30 5.10 2.1 1.2 ]; % (chemm)
|
||||
%leg_pos_mt = [5.30 5.10 2.1 1.2 ]; % (chemm)
|
||||
leg_pos_mt = [15.90 13.60 2.1 1.2 ]; % (strsm)
|
||||
sp_margins = [ 0.068 0.051 ];
|
||||
end
|
||||
|
||||
@@ -59,7 +61,7 @@ eige_str = 'eigen';
|
||||
|
||||
% Create filename "templates" for the files that contain the performance
|
||||
% results.
|
||||
filetemp = '%s/output_%s_%s_%s.m'
|
||||
filetemp = '%s/output_%s_%s_%s.m';
|
||||
filetemp_blis = sprintf( filetemp, '%s', '%s', '%s', blis_str );
|
||||
filetemp_open = sprintf( filetemp, '%s', '%s', '%s', open_str );
|
||||
filetemp_vend = sprintf( filetemp, '%s', '%s', '%s', vend_str );
|
||||
@@ -102,7 +104,7 @@ for opi = 1:n_opnames
|
||||
data_blis, ...
|
||||
data_open, ...
|
||||
data_eige, ...
|
||||
data_vend, vend_str, ...
|
||||
data_vend, vend_leg_str, ...
|
||||
nth, ...
|
||||
4, 5, ...
|
||||
cfreq, ...
|
||||
|
||||
@@ -24,7 +24,6 @@ plot_panel_4x5(2.60,16,64, '1s','../results/zen2/20200929/jc4ic4jr4','zen2','MKL
|
||||
plot_panel_4x5(2.60,16,128,'2s','../results/zen2/20200929/jc8ic4jr4','zen2','MKL'); close all; clear all;
|
||||
|
||||
% a64fx
|
||||
plot_panel_4x5(2.20,32,1, 'st','../results/a64fx/20210316/st', 'a64fx','Fujitsu SSL2'); close all; clear all;
|
||||
plot_panel_4x5(2.20,32,12,'1s','../results/a64fx/20210316/jc1ic4jr3', 'a64fx','Fujitsu SSL2'); close all; clear all;
|
||||
plot_panel_4x5(2.20,32,48,'2s','../results/a64fx/20210316/jc1ic4jr12','a64fx','Fujitsu SSL2'); close all; clear all;
|
||||
|
||||
plot_panel_4x5(2.20,32,1, 'st','../results/a64fx/20210520/st', 'a64fx','Fujitsu SSL2'); close all; clear all;
|
||||
plot_panel_4x5(2.20,32,12,'1s','../results/a64fx/20210520/jc1ic1jr12','a64fx','Fujitsu SSL2'); close all; clear all;
|
||||
plot_panel_4x5(2.20,32,48,'2s','../results/a64fx/20210520/jc1ic4jr12','a64fx','Fujitsu SSL2'); close all; clear all;
|
||||
|
||||
@@ -254,18 +254,17 @@ void libblis_test_gemm_experiment
|
||||
bli_setsc( 0.9, 1.0, &beta );
|
||||
}
|
||||
|
||||
#if 0
|
||||
//bli_setm( &BLIS_ONE, &a );
|
||||
bli_setsc( 1.0, 0.0, &alpha );
|
||||
bli_setsc( 1.0, 0.0, &beta );
|
||||
#endif
|
||||
|
||||
// Randomize A, B, and C, and save C.
|
||||
libblis_test_mobj_randomize( params, TRUE, &a );
|
||||
libblis_test_mobj_randomize( params, TRUE, &b );
|
||||
libblis_test_mobj_randomize( params, TRUE, &c );
|
||||
bli_copym( &c, &c_save );
|
||||
//bli_setm( &BLIS_ONE, &a );
|
||||
//bli_setsc( 1.0, 0.0, &alpha );
|
||||
//bli_setsc( 0.0, 0.0, &beta );
|
||||
|
||||
//bli_setm( &BLIS_ONE, &a );
|
||||
//bli_setsc( 1.0, 0.0, &alpha );
|
||||
//bli_setsc( 0.0, 0.0, &beta );
|
||||
|
||||
// Apply the parameters.
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
@@ -456,11 +455,13 @@ bli_printm( "c", c, "%5.2f", "" );
|
||||
// bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
|
||||
//bli_printm( "c before", c, "%6.3f", "" );
|
||||
bli_gemm( alpha, a, b, beta, c );
|
||||
//bls_gemm( alpha, a, b, beta, c );
|
||||
#if 0
|
||||
if ( bli_obj_length( c ) == 12 &&
|
||||
bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
|
||||
bli_printm( "c after", c, "%6.3f", "" );
|
||||
#endif
|
||||
//bli_printm( "c after", c, "%5.2f", "" );
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
Reference in New Issue
Block a user