Merge branch 'master' into dev

This commit is contained in:
Field G. Van Zee
2021-06-13 19:44:14 -05:00
81 changed files with 8915 additions and 79 deletions

View File

@@ -1,40 +1,50 @@
language: c
sudo: required
dist: trusty
dist: focal
matrix:
include:
# full testsuite (all tests except for mixed datatype)
- os: linux
compiler: gcc
env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto"
env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto" \
PACKAGES="gcc-8 binutils"
# mixed-datatype testsuite (gemm_nn only)
- os: linux
compiler: gcc
env: OOT=0 TEST=MD SDE=0 THR="none" CONF="auto"
env: OOT=0 TEST=MD SDE=0 THR="none" CONF="auto" \
PACKAGES="gcc-8 binutils"
# salt testsuite (fast set of operations+parameters)
- os: linux
compiler: gcc
env: OOT=0 TEST=SALT SDE=0 THR="none" CONF="auto"
env: OOT=0 TEST=SALT SDE=0 THR="none" CONF="auto" \
PACKAGES="gcc-8 binutils"
# test x86_64 ukrs with SDE
- os: linux
compiler: gcc
env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64"
env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64" \
PACKAGES="gcc-8 binutils"
# openmp build
- os: linux
compiler: gcc
env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto"
env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto" \
PACKAGES="gcc-8 binutils"
# pthreads build
- os: linux
compiler: gcc
env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto"
env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto" \
PACKAGES="gcc-8 binutils"
# out-of-tree build
- os: linux
compiler: gcc
env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto"
env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto" \
PACKAGES="gcc-8 binutils"
# clang build
- os: linux
compiler: clang
env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto"
# There seems to be some difficulty installing 2 Clang toolchains of different versions.
# Use the TravisCI default.
# PACKAGES="clang-8 binutils"
# macOS with system compiler (clang)
- os: osx
compiler: clang
@@ -43,29 +53,23 @@ matrix:
- os: linux
compiler: arm-linux-gnueabihf-gcc
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa15" \
PACKAGES="gcc-arm-linux-gnueabihf qemu-system-arm qemu-user" \
PACKAGES="gcc-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-system-arm qemu-user" \
TESTSUITE_WRAPPER="qemu-arm -cpu cortex-a15 -L /usr/arm-linux-gnueabihf/"
# cortexa57 build and fast testsuite (qemu)
- os: linux
compiler: aarch64-linux-gnu-gcc
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa57" \
PACKAGES="gcc-aarch64-linux-gnu qemu-system-arm qemu-user" \
PACKAGES="gcc-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/"
# armsve build and fast testsuite (qemu)
- os: linux
compiler: aarch64-linux-gnu-gcc-10
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="armsve" \
PACKAGES="gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/"
install:
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/as; fi
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/as /usr/bin/as; fi
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/ld; fi
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/ld /usr/bin/ld; fi
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-6"; fi
- if [ -n "$PACKAGES" ]; then sudo apt-get install -y $PACKAGES; fi
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-6
- binutils-2.26
- clang
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8"; fi
- if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi
script:
- export DIST_PATH=.
- pwd
@@ -76,5 +80,7 @@ script:
- ls -l
- $CC --version
- make -j 2
# Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed on real chip (A64fx).
- if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi
- if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi
- if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi

View File

@@ -104,6 +104,7 @@ but many others have contributed code and feedback, including
Costas Yamin @cosstas
Chenhan Yu @ChenhanYu (The University of Texas at Austin)
Roman Yurchak @rth (Symerio)
Stefano Zampini @stefanozampini
M. Zhou @cdluminate
BLIS's development was partially funded by grants from industry

View File

@@ -461,7 +461,7 @@ endif
flat-header: check-env $(BLIS_H_FLAT)
$(BLIS_H_FLAT): $(FRAME_H99_FILES)
$(BLIS_H_FLAT): $(ALL_H99_FILES)
ifeq ($(ENABLE_VERBOSE),yes)
$(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
else

View File

@@ -13,6 +13,7 @@ Contents
* **[Key Features](#key-features)**
* **[How to Download BLIS](#how-to-download-blis)**
* **[Getting Started](#getting-started)**
* **[Performance](#performance)**
* **[Documentation](#documentation)**
* **[External Packages](#external-packages)**
* **[Discussion](#discussion)**
@@ -393,6 +394,24 @@ If/when you have time, we *strongly* encourage you to read the detailed
walkthrough of the build system found in our [Build System](docs/BuildSystem.md)
guide.
Performance
-----------
We provide graphs that report performance of several implementations across a
range of hardware types, multithreading configurations, problem sizes,
operations, and datatypes. These pages also document most of the details needed
to reproduce these experiments.
* **[Performance](docs/Performance.md).** This document reports empirically
measured performance of a representative set of level-3 operations on a variety
of hardware architectures, as implemented within BLIS and other BLAS libraries
for all four of the standard floating-point datatypes.
* **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports
empirically measured performance of `gemm` on select hardware architectures
within BLIS and other BLAS libraries when performing matrix problems where one
or two dimensions is exceedingly small.
Documentation
-------------

View File

@@ -202,12 +202,6 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)"
files-that-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),)))
files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f))))
# Define a function that removes duplicate words from a list.
# NOTE: This function was obtained via [1]; thanks bobbogo for this
# concise definition.
# [1] https://stackoverflow.com/questions/16144115/makefile-remove-duplicate-words-without-sorting
rm-dupls = $(if $1,$(firstword $1) $(call rm-dupls,$(filter-out $(firstword $1),$1)))
#
# --- Include makefile configuration file --------------------------------------

View File

@@ -0,0 +1,117 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschungszentrum Juelich
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// A64FX: set up cache sizes
//
// Reference: A64FX (TM) specification Fujitsu HPC Extension
// Link: https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_Specification_HPC_Extension_v1_EN.pdf
//
// 63:15 | 14:12 | 11 | 10:08 | 07 | 06:04 | 03 | 02:00 |
// RES0 | l1_sec3_max | RES0 | l1_sec2_max | RES0 | l1_sec1_max | RES0 | l1_sec0_max |
//
// the bits set number of maximum sectors from 0-7
// 000 - 0
// 001 - 1
// 010 - 2
// 011 - 3
// 100 - 4
// 101 - 5
// 110 - 6
// 111 - 7
//
// For L1 we want to maximize the number of sectors for B
// Configuration 1: 1 sector for C (sector 3)
// 1 sector for A (sector 1)
// 6 sectors for B (sector 2)
// 0 sectors for the rest (sector 0)
//
// 16b bitfield conf. 1: 0b0 001 0 110 0 001 0 000
//
// Configuration 2: 1 sector for C (sector 3)
// 1 sector for A (sector 1)
// 5 sectors for B (sector 2)
// 1 sectors for the rest (sector 0)
//
// 16b bitfield conf. 2: 0b0 001 0 101 0 001 0 001
//
// accessing the control register:
//
// MRS <Xt>, S3_3_C11_C8_2
// MSR S3_3_C11_C8_2, <Xt>
//
// TODO: First tests showed no change in performance, a deeper investigation
// is necessary
// Write the 16-bit L1 sector-cache maximum-capacity bitfield (layout
// documented above) to system register s3_3_c11_c8_2.
// NOTE(review): assumes EL0 access to this IMPLEMENTATION DEFINED register
// is permitted by the OS -- confirm on the target system (see the Fugaku
// caveat at the use site).
#define A64FX_SETUP_SECTOR_CACHE_SIZES(config_bitfield)\
{\
uint64_t sector_cache_config = config_bitfield;\
__asm__ volatile(\
"msr s3_3_c11_c8_2,%[sector_cache_config]"\
:\
: [sector_cache_config] "r" (sector_cache_config)\
:\
);\
}
// Same as above, but writes the L2 sector-cache configuration via
// system register s3_3_c15_c8_2.
#define A64FX_SETUP_SECTOR_CACHE_SIZES_L2(config_bitfield)\
{\
uint64_t sector_cache_config = config_bitfield;\
__asm__ volatile(\
"msr s3_3_c15_c8_2,%[sector_cache_config]"\
:\
: [sector_cache_config] "r" (sector_cache_config)\
:\
);\
}
// Assembly-text helper: tag the address in register `areg` with sector id
// `tag` by shifting the tag into bits 63:56 (the address top byte) and
// OR-ing it in; `sparereg` is used as scratch.
#define A64FX_SET_CACHE_SECTOR(areg, tag, sparereg)\
" mov "#sparereg", "#tag" \n\t"\
" lsl "#sparereg", "#sparereg", 56 \n\t"\
" orr "#areg", "#areg", "#sparereg" \n\t"
// Read the current L1 sector-cache configuration register back into the
// given uint64_t lvalue (inverse of A64FX_SETUP_SECTOR_CACHE_SIZES).
#define A64FX_READ_SECTOR_CACHE_SIZES(output_uint64)\
__asm__ volatile(\
"mrs %["#output_uint64"],s3_3_c11_c8_2"\
: [output_uint64] "=r" (output_uint64)\
: \
:\
);
// Pack four 3-bit L1 per-sector maxima (sec0..sec3) into the bitfield
// layout documented above: bits 2:0, 6:4, 10:8, 14:12.
#define A64FX_SCC(sec0,sec1,sec2,sec3)\
(uint64_t)((sec0 & 0x7LU) | ((sec1 & 0x7LU) << 4) | ((sec2 & 0x7LU) << 8) | ((sec3 & 0x7LU) << 12))
// Pack the two 5-bit L2 values (parameter names suggest they cover
// sector pairs 0/2 and 1/3) into bits 4:0 and 12:8.
#define A64FX_SCC_L2(sec02,sec13)\
(uint64_t)((sec02 & 0x1FLU) | ((sec13 & 0x1FLU) << 8))

View File

@@ -0,0 +1,151 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "bli_a64fx_sector_cache.h"
// Initialize a BLIS context for the Fujitsu A64FX (SVE-512) using fixed
// blocksizes and the vector-length-agnostic SVE gemm microkernels.
void bli_cntx_init_a64fx( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
// NOTE: thresh[] is referenced only inside the "#if 0" sup section
// below; it is unused in the active build.
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_a64fx_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
);
// Set SVE-512 packing routine.
bli_cntx_set_packm_kers
(
3,
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// MR is two SVE-512 vectors (32 floats / 16 doubles) and NR is 10,
// matching the 2vx10 microkernels registered above.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
// The sup (small/unpacked) path below is disabled pending tuning.
#if 0
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
4,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// NOTE: the sup kernel is 10x2v -- the transpose of the native 2vx10 --
// hence MR = 10 and NR = 16 here.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif
// Set A64FX cache sector sizes for each PE/CMG
// SC Fugaku might disable users' setting cache sizes.
// NOTE(review): the MSR is issued inside an OpenMP parallel region,
// presumably so every thread configures its own core's register;
// this relies on threads being pinned to distinct PEs -- confirm.
#if !defined(CACHE_SECTOR_SIZE_READONLY)
#pragma omp parallel
{
A64FX_SETUP_SECTOR_CACHE_SIZES(A64FX_SCC(0,1,3,0))
A64FX_SETUP_SECTOR_CACHE_SIZES_L2(A64FX_SCC_L2(9,28))
}
#endif
}

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// NOTE(review): include guards are commented out -- presumably this header
// is included exactly once via the flattened blis.h; confirm before
// re-enabling.
// -- MEMORY ALLOCATION --------------------------------------------------------
// Alignment (in bytes) for SIMD-related allocations.
#define BLIS_SIMD_ALIGN_SIZE 256
#define BLIS_SIMD_NUM_REGISTERS 32
//#endif

82
config/a64fx/make_defs.mk Normal file
View File

@@ -0,0 +1,82 @@
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
THIS_CONFIG := a64fx
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
# -D_A64FX presumably guards A64FX-specific code paths in the framework
# sources -- verify against its consumers.
CPPROCFLAGS := -D_GNU_SOURCE -D_A64FX
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
# -march=armv8-a+sve enables SVE code generation, which this
# configuration's kernels require.
COPTFLAGS := -O3 -ftree-vectorize -march=armv8-a+sve
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
CKVECFLAGS :=
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
# Reference kernels are built with relaxed FP semantics and FMA
# contraction where the compiler vendor is known to support the flags.
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif
endif
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -0,0 +1,92 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschungszentrum Juelich
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Query the vector length of the SVE implementation at runtime.
//
// NOTE: `incb x0` increments x0 by the number of *bytes* in one SVE
// vector (x0 starts at zero), so despite the "_bits" suffix this
// routine returns the vector length in bytes; callers divide the
// result by sizeof(element) to obtain the number of lanes.
dim_t bli_vl_bits_armsve(void)
{
// Fix: removed a stray "\" line continuation after the opening brace,
// left over from a macro-ized version of this function.
uint64_t vl = 0;
// "volatile" keeps the query from being elided or reordered; x0 is
// declared as clobbered scratch.
__asm__ volatile (
" mov x0, xzr \n\t"
" incb x0 \n\t"
" mov %[vl], x0 \n\t"
: [vl] "=r" (vl)
:
: "x0"
);
return vl;
}
// Generate void bli_?_blksz_armsve( m_r_, n_r_, k_c_, m_c_, n_c_ ) for
// datatype character `ch` with element size `S_Data` in bytes.
//
// Blocksizes are derived analytically from the cache geometry (in the
// spirit of the analytical blocking model): for each level Lx,
// W_Lx = associativity (ways), N_Lx = number of sets, C_Lx = line size
// in bytes. Defaults come from bli_family_armsve.h and may be
// overridden at runtime via the BLIS_SVE_{W,N,C}_L{1,2,3} environment
// variables.
//
// m_r is two SVE vectors of elements and n_r = 10, matching the 2vx10
// gemm microkernels; k_c is sized against L1, m_c (packed A block)
// against L2, and n_c (packed B block) against L3, each rounded down to
// a multiple of the corresponding register blocksize.
// NOTE(review): bli_vl_bits_armsve() appears to return the VL in
// *bytes* (see `incb` in its implementation), so vl below is the
// element count per vector.
#define EXPANDMAC_BLKSZ_ARMSVE(ch, S_Data) \
void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \
dim_t *k_c_, dim_t *m_c_, dim_t *n_c_) \
{ \
dim_t W_L1 = bli_env_get_var("BLIS_SVE_W_L1", W_L1_SVE_DEFAULT); \
dim_t N_L1 = bli_env_get_var("BLIS_SVE_N_L1", N_L1_SVE_DEFAULT); \
dim_t C_L1 = bli_env_get_var("BLIS_SVE_C_L1", C_L1_SVE_DEFAULT); \
dim_t W_L2 = bli_env_get_var("BLIS_SVE_W_L2", W_L2_SVE_DEFAULT); \
dim_t N_L2 = bli_env_get_var("BLIS_SVE_N_L2", N_L2_SVE_DEFAULT); \
dim_t C_L2 = bli_env_get_var("BLIS_SVE_C_L2", C_L2_SVE_DEFAULT); \
dim_t W_L3 = bli_env_get_var("BLIS_SVE_W_L3", W_L3_SVE_DEFAULT); \
dim_t N_L3 = bli_env_get_var("BLIS_SVE_N_L3", N_L3_SVE_DEFAULT); \
dim_t C_L3 = bli_env_get_var("BLIS_SVE_C_L3", C_L3_SVE_DEFAULT); \
\
dim_t vl_b = bli_vl_bits_armsve(); \
dim_t vl = vl_b / S_Data; \
dim_t m_r = 2 * vl; \
dim_t n_r = 10; \
\
dim_t k_c = (dim_t)( floor((W_L1 - 1.0)/(1.0 + (double)n_r/m_r)) * N_L1 * C_L1 ) \
/ (n_r * S_Data); \
\
dim_t C_Ac = W_L2 - 1 - ceil( (2.0 * k_c * n_r * S_Data)/(C_L2 * N_L2) ); \
dim_t m_c = C_Ac * (N_L2 * C_L2)/(k_c * S_Data); \
m_c -= m_c % m_r; \
\
dim_t C_Bc = W_L3 - 1 - ceil( (2.0 * k_c * m_c * S_Data)/(C_L3 * N_L3) ); \
dim_t n_c = C_Bc * (N_L3 * C_L3)/(k_c * S_Data); \
n_c -= n_c % n_r; \
\
*m_r_ = m_r; \
*n_r_ = n_r; \
*k_c_ = k_c; \
*m_c_ = m_c; \
*n_c_ = n_c; \
}
// Instantiate for single (4-byte) and double (8-byte) real.
EXPANDMAC_BLKSZ_ARMSVE( s, 4 )
EXPANDMAC_BLKSZ_ARMSVE( d, 8 )

View File

@@ -0,0 +1,42 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschungszentrum Juelich
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Runtime query of the SVE vector length. NOTE(review): the
// implementation uses `incb`, which counts *bytes*, so this appears to
// return bytes despite the "_bits" name -- confirm before relying on it.
dim_t bli_vl_bits_armsve(void);
// Analytically-derived level-3 blocksizes for single/double real,
// computed from the cache geometry (BLIS_SVE_* environment variables
// override the compiled-in defaults).
void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);

View File

@@ -0,0 +1,157 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "bli_armsve_config_utils.h"
// Initialize a BLIS context for generic Arm SVE hardware. Blocksizes are
// computed at runtime from the actual vector length and cache geometry
// (see bli_armsve_config_utils.h), unlike the fixed-size a64fx config.
void bli_cntx_init_armsve( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
#if 0
blksz_t thresh[ BLIS_NUM_THRESH ];
#endif
// Set default kernel blocksizes and functions.
bli_cntx_init_armsve_ref( cntx );
// -------------------------------------------------------------------------
// Block size.
// Query runtime-derived blocksizes for single and double precision.
dim_t m_r_s, n_r_s, k_c_s, m_c_s, n_c_s;
dim_t m_r_d, n_r_d, k_c_d, m_c_d, n_c_d;
bli_s_blksz_armsve(&m_r_s, &n_r_s, &k_c_s, &m_c_s, &n_c_s);
bli_d_blksz_armsve(&m_r_d, &n_r_d, &k_c_d, &m_c_d, &n_c_d);
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
2,
// These are vector-length agnostic kernels. Yet knowing mr is required at runtime.
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
);
// Set VL-specific packing routines if applicable.
// m_r_d == 16 corresponds to SVE-512; m_r_d == 8 to SVE-256. Other
// vector lengths fall back to the reference packing kernels.
if (m_r_d==16)
bli_cntx_set_packm_kers
(
3,
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
cntx
);
else if (m_r_d==8)
bli_cntx_set_packm_kers
(
1,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_asm_8xk,
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
// The sup (small/unpacked) path below is disabled pending tuning.
#if 0
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
4,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// NOTE: the sup kernel is 10x2v -- the transpose of the native 2vx10 --
// so sup MR/NR deliberately take the native n_r_d/m_r_d values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif
}

View File

@@ -0,0 +1,56 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE(review): the include guard is intentionally commented out in this
// family header as provided; confirm whether multiple inclusion is possible
// before relying on it.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// Alignment, in bytes, used for SIMD-related memory allocations.
#define BLIS_SIMD_ALIGN_SIZE 256
// Number of vector registers assumed available to kernels.
#define BLIS_SIMD_NUM_REGISTERS 32
// SVE-specific configs.
// Cache geometry defaults consumed by the SVE configuration. The values
// below are consistent with N_* = number of sets, W_* = associativity
// (ways), and C_* = line size in bytes (e.g. L1: 64 sets x 4 ways x 256 B
// = 64 KiB; L2: 2048 x 16 x 256 B = 8 MiB) -- presumably; TODO confirm
// against the code that reads these macros (not visible in this file).
#define N_L1_SVE_DEFAULT 64
#define W_L1_SVE_DEFAULT 4
#define C_L1_SVE_DEFAULT 256
#define N_L2_SVE_DEFAULT 2048
#define W_L2_SVE_DEFAULT 16
#define C_L2_SVE_DEFAULT 256
#define N_L3_SVE_DEFAULT 8192
#define W_L3_SVE_DEFAULT 16
#define C_L3_SVE_DEFAULT 256
//#endif

View File

@@ -0,0 +1,82 @@
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
THIS_CONFIG    := armsve
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS := -D_GNU_SOURCE
CMISCFLAGS  :=
CPICFLAGS   :=
CWARNFLAGS  :=
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS   := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
# NOTE: -march=armv8-a+sve must remain enabled even with optimization
# disabled; otherwise kernels written with SVE intrinsics/assembly fail
# to compile under DEBUG_TYPE=noopt.
COPTFLAGS   := -O0 -march=armv8-a+sve
else
COPTFLAGS   := -O3 -ftree-vectorize -march=armv8-a+sve
endif
# Flags specific to optimized kernels.
CKOPTFLAGS  := $(CKOPTFLAGS)$(COPTFLAGS)
CKVECFLAGS  :=
# Flags specific to reference kernels.
CROPTFLAGS  := $(CKOPTFLAGS)
# gcc and clang both accept the same relaxed floating-point flags for
# reference kernels; any other compiler vendor gets no extra flags.
ifneq (,$(filter gcc clang,$(CC_VENDOR)))
CRVECFLAGS  := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS  := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -32,6 +32,8 @@ piledriver: piledriver
bulldozer: bulldozer
# ARM architectures.
armsve: armsve/armsve
a64fx: a64fx/armsve
thunderx2: thunderx2/armv8a
cortexa57: cortexa57/armv8a
cortexa53: cortexa53/armv8a

5
configure vendored
View File

@@ -2373,6 +2373,11 @@ main()
fi
echo "${script_name}: using '${found_cc}' C compiler."
# Also check the compiler to see if we are (cross-)compiling for Windows
if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then
is_win=yes
fi
# -- Find a C++ compiler ---------------------------------------------------

View File

@@ -154,7 +154,7 @@ Originally, BLIS did indeed require the application to explicitly setup (initial
Yes! BLIS supports multithreading (via OpenMP or POSIX threads) for all of its level-3 operations. For more information on enabling and controlling multithreading, please see the [Multithreading](Multithreading.md) guide.
BLIS is also thread-safe so that you can call BLIS from threads within a multithreaded library or application. BLIS derives is thread-safety via unconditional use of features present in POSIX threads (pthreads). These pthreads features are employed for thread-safety regardless of whether BLIS is configured for OpenMP multithreading, pthreads multithreading, or single-threaded execution.
BLIS is also thread-safe so that you can call BLIS from threads within a multithreaded library or application. BLIS derives its thread-safety via unconditional use of features present in POSIX threads (pthreads). These pthreads features are employed for thread-safety regardless of whether BLIS is configured for OpenMP multithreading, pthreads multithreading, or single-threaded execution.
### Does BLIS support NUMA environments?

View File

@@ -21,6 +21,9 @@
* **[Zen2](Performance.md#zen2)**
* **[Experiment details](Performance.md#zen2-experiment-details)**
* **[Results](Performance.md#zen2-results)**
* **[A64fx](Performance.md#a64fx)**
* **[Experiment details](Performance.md#a64fx-experiment-details)**
* **[Results](Performance.md#a64fx-results)**
* **[Feedback](Performance.md#feedback)**
# Introduction
@@ -526,6 +529,78 @@ The `runthese.m` file will contain example invocations of the function.
---
## A64fx
### A64fx experiment details
* Location: RIKEN Center for Computational Science in Kobe, Japan
* These test results were gathered on the Fugaku supercomputer under project "量子物質の創発と機能のための基礎科学 ―「富岳」と最先端実験の密連携による革新的強相関電子科学" (hp200132) (Basic Science for Emergence and Functionality in Quantum Matter: Innovative Strongly-Correlated Electron Science by Integration of "Fugaku" and Frontier Experiments)
* Processor model: Fujitsu A64fx
* Core topology: one socket, 4 NUMA groups per socket, 13 cores per group (one reserved for the OS), 48 cores total
* SMT status: Unknown
* Max clock rate: 2.2GHz (single- and multicore, observed)
* Max vector register length: 512 bits (SVE)
* Max FMA vector IPC: 2
* Peak performance:
* single-core: 70.4 GFLOPS (double-precision), 140.8 GFLOPS (single-precision)
* multicore: 70.4 GFLOPS/core (double-precision), 140.8 GFLOPS/core (single-precision)
* Operating system: RHEL 8.3
* Page size: 256 bytes
* Compiler: gcc 10.1.0
* Results gathered: 2 April 2021; BLIS and SSL2 updated on 20 May 2021
* Implementations tested:
* BLIS 61584de (post-0.8.1)
* configured with:
* `../configure -t none CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (single-threaded)
* `../configure -t openmp CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (multithreaded)
* sub-configuration exercised: `a64fx`
* Single-threaded (1 core) execution requested via no change in environment variables
* Multithreaded (12 core) execution requested via `export BLIS_JC_NT=1 BLIS_IC_NT=1 BLIS_JR_NT=12`
* Multithreaded (48 core) execution requested via `export BLIS_JC_NT=1 BLIS_IC_NT=4 BLIS_JR_NT=12`
* Eigen 3.3.9
* Obtained via the [Eigen GitLab homepage](https://gitlab.com/libeigen/eigen)
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12`
* Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48`
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
* ARMPL (20.1.0 for A64fx)
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12`
* Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48`
* **NOTE**: While this version of ARMPL does provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, and `trsm` (with the exception of `dtrsm`), these implementations yield very low performance, and their long run times led us to skip collecting these data altogether.
* Fujitsu SSL2 (Fujitsu toolchain 1.2.31)
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1 NPARALLEL=1`
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12 NPARALLEL=12`
* Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48 NPARALLEL=48`
* Affinity:
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="12-23 24-35 36-47 48-59"`.
* All executables were run through `numactl --interleave=all` (multithreaded only).
* Frequency throttling: No change made. No frequency lowering observed.
* Comments:
* Special thanks to Stepan Nassyr and RuQing G. Xu for their work in developing and optimizing A64fx support. Also, thanks to RuQing G. Xu for collecting the data that appear in these graphs.
### A64fx results
#### pdf
* [A64fx single-threaded](graphs/large/l3_perf_a64fx_nt1.pdf)
* [A64fx multithreaded (12 cores)](graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf)
* [A64fx multithreaded (48 cores)](graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf)
#### png (inline)
* **A64fx single-threaded**
![single-threaded](graphs/large/l3_perf_a64fx_nt1.png)
* **A64fx multithreaded (12 cores)**
![multithreaded (12 cores)](graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png)
* **A64fx multithreaded (48 cores)**
![multithreaded (48 cores)](graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png)
---
# Feedback
Please let us know what you think of these performance results! Similarly, if you have any questions or concerns, or are interested in reproducing these performance experiments on your own hardware, we invite you to [open an issue](https://github.com/flame/blis/issues) and start a conversation with BLIS developers.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 250 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 260 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 250 KiB

View File

@@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \
} \
else /* if ( will_pack == TRUE ) */ \
{ \
/* NOTE: This is "rounding up" of the last upanel is actually optional
/* NOTE: This "rounding up" of the last upanel is actually optional
for the rrc/crc cases, but absolutely necessary for the other cases
since we NEED that last micropanel to have the same ldim (cs_p) as
the other micropanels. Why? So that millikernels can use the same

View File

@@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \
} \
else /* if ( will_pack == TRUE ) */ \
{ \
/* NOTE: This is "rounding up" of the last upanel is actually optional
/* NOTE: This "rounding up" of the last upanel is actually optional
for the rrc/crc cases, but absolutely necessary for the other cases
since we NEED that last micropanel to have the same ldim (cs_p) as
the other micropanels. Why? So that millikernels can use the same
@@ -280,15 +280,15 @@ void PASTEMAC(ch,opname) \
} \
else \
{ \
/* All other stor3_t ids: pack A to column-stored row-panels. */ \
/* All other stor3_t ids: pack B to row-stored column-panels. */ \
*rs_p = nr; \
*cs_p = 1; \
\
*pd_p = nr; \
*ps_p = k * nr; \
\
/* Set the schema to "packed row panels" to indicate packing to
conventional column-stored row panels. */ \
/* Set the schema to "packed column panels" to indicate packing to
conventional row-stored column panels. */ \
*schema = BLIS_PACKED_COL_PANELS; \
} \
\

View File

@@ -173,6 +173,12 @@ void bli_arch_set_id( void )
#endif
// ARM microarchitectures.
#ifdef BLIS_FAMILY_ARMSVE
id = BLIS_ARCH_ARMSVE;
#endif
#ifdef BLIS_FAMILY_A64FX
id = BLIS_ARCH_A64FX;
#endif
#ifdef BLIS_FAMILY_THUNDERX2
id = BLIS_ARCH_THUNDERX2;
#endif
@@ -242,6 +248,8 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
"thunderx2",
"cortexa57",
"cortexa53",
"armsve",
"a64fx",
"cortexa15",
"cortexa9",

View File

@@ -76,7 +76,7 @@ arch_t bli_cpuid_query_id( void )
printf( "vendor = %s\n", vendor==1 ? "AMD": "INTEL" );
printf("family = %x\n", family );
printf( "model = %x\n", model );
printf( "features = %x\n", features );
#endif
@@ -455,6 +455,14 @@ arch_t bli_cpuid_query_id( void )
{
// Check for each ARMv8 configuration that is enabled, check for that
// microarchitecture. We check from most recent to most dated.
#ifdef BLIS_CONFIG_ARMSVE
if ( bli_cpuid_is_armsve( model, part, features ) )
return BLIS_ARCH_ARMSVE;
#endif
#ifdef BLIS_CONFIG_A64FX
if ( bli_cpuid_is_a64fx( model, part, features ) )
return BLIS_ARCH_A64FX;
#endif
#ifdef BLIS_CONFIG_THUNDERX2
if ( bli_cpuid_is_thunderx2( model, part, features ) )
return BLIS_ARCH_THUNDERX2;
@@ -537,6 +545,36 @@ bool bli_cpuid_is_cortexa53
return TRUE;
}
// Returns TRUE if the CPU described by the given identification fields can
// run the 'armsve' subconfiguration.
// NOTE: Only the feature bits are inspected; 'model' and 'part' are unused
// but kept for signature consistency with the prototype in bli_cpuid.h
// (which names the parameters model/part/features) and with the other ARM
// predicates such as bli_cpuid_is_thunderx2().
bool bli_cpuid_is_armsve
     (
       uint32_t model,
       uint32_t part,
       uint32_t features
     )
{
	// The armsve subconfig requires only that the CPU support SVE.
	const uint32_t expected = FEATURE_SVE;

	if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;

	return TRUE;
}
// Returns TRUE if the CPU described by the given identification fields can
// run the 'a64fx' subconfiguration.
// NOTE: Like bli_cpuid_is_armsve(), this currently checks only for SVE
// support; 'model' and 'part' are unused but kept for signature consistency
// with the prototype in bli_cpuid.h (model/part/features). It does NOT
// verify that the part number is actually an A64fx -- TODO confirm whether
// a stricter check is intended.
bool bli_cpuid_is_a64fx
     (
       uint32_t model,
       uint32_t part,
       uint32_t features
     )
{
	// The a64fx subconfig requires that the CPU support SVE.
	const uint32_t expected = FEATURE_SVE;

	if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;

	return TRUE;
}
bool bli_cpuid_is_cortexa15
(
uint32_t family,
@@ -1032,6 +1070,10 @@ uint32_t bli_cpuid_query
strstr( feat_str, "asimd" ) != NULL )
*features |= FEATURE_NEON;
// Parse the feature string to check for SVE features.
if ( strstr( feat_str, "sve" ) != NULL )
*features |= FEATURE_SVE;
//printf( "bli_cpuid_query(): features var: %u\n", *features );
// Parse the processor string to uncover the model.

View File

@@ -72,6 +72,8 @@ bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );
@@ -175,7 +177,8 @@ enum
};
enum
{
FEATURE_NEON = 0x1
FEATURE_NEON = 0x01,
FEATURE_SVE = 0x02
};
#endif

View File

@@ -144,6 +144,16 @@ void bli_gks_init( void )
bli_cntx_init_cortexa53_ref,
bli_cntx_init_cortexa53_ind );
#endif
#ifdef BLIS_CONFIG_ARMSVE
bli_gks_register_cntx( BLIS_ARCH_ARMSVE, bli_cntx_init_armsve,
bli_cntx_init_armsve_ref,
bli_cntx_init_armsve_ind );
#endif
#ifdef BLIS_CONFIG_A64FX
bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx,
bli_cntx_init_a64fx_ref,
bli_cntx_init_a64fx_ind );
#endif
#ifdef BLIS_CONFIG_CORTEXA15
bli_gks_register_cntx( BLIS_ARCH_CORTEXA15, bli_cntx_init_cortexa15,
bli_cntx_init_cortexa15_ref,

View File

@@ -1,12 +1,14 @@
/* dlamch.f -- translated by f2c (version 19991025).
You must link the resulting object file with the libraries:
-lf2c -lm (in that order)
*/
#include "blis.h"
#include <float.h>
#include <fenv.h>
#include <ctype.h>
#ifdef __cplusplus
extern "C" {
#endif
#include "blis.h"
#ifdef BLIS_ENABLE_LEGACY_LAMCH
double bli_pow_di( bla_double* a, bla_integer* n );
@@ -1027,6 +1029,59 @@ L10:
} /* bli_dlamc5_ */
#ifdef __cplusplus
#else
/* Returns double-precision machine parameters, selected by *cmach
   (case-insensitive; cmach_len is the Fortran hidden string length and
   is unused):

      = 'E' or 'e',   DLAMCH := eps   (relative machine precision)
      = 'S' or 's',   DLAMCH := sfmin (safe minimum: 1/sfmin does not overflow)
      = 'B' or 'b',   DLAMCH := base  (base of the machine)
      = 'P' or 'p',   DLAMCH := eps*base
      = 'N' or 'n',   DLAMCH := t     (number of base digits in the mantissa)
      = 'R' or 'r',   DLAMCH := rnd   (1.0 when rounding occurs in addition)
      = 'M' or 'm',   DLAMCH := emin  (minimum exponent before gradual underflow)
      = 'U' or 'u',   DLAMCH := rmin  (underflow threshold: base**(emin-1))
      = 'L' or 'l',   DLAMCH := emax  (largest exponent before overflow)
      = 'O' or 'o',   DLAMCH := rmax  (overflow threshold: (base**emax)*(1-eps))

   Any other selector returns 0.0, matching LAPACK's permissive behavior. */
bla_double bli_dlamch(bla_character *cmach, ftnlen cmach_len)
{
	double safe_min = DBL_MIN;
	double small    = 1.0 / DBL_MAX;

	/* If 1/rmax rounds up to (or above) rmin, nudge sfmin upward so that
	   1/sfmin cannot overflow. */
	if ( small >= safe_min )
		safe_min = small * ( 1.0 + DBL_EPSILON );

	switch ( toupper( *cmach ) )
	{
		case 'E': return DBL_EPSILON;
		case 'S': return safe_min;
		case 'B': return FLT_RADIX;
		case 'P': return FLT_RADIX*DBL_EPSILON;
		case 'N': return DBL_MANT_DIG;
		/* FLT_ROUNDS encodes round-to-nearest as the value 1. FE_TONEAREST
		   is an unrelated, implementation-defined <fenv.h> constant (0 on
		   glibc/x86), so comparing FLT_ROUNDS against it was incorrect. */
		case 'R': return FLT_ROUNDS == 1 ? 1.0 : 0.0;
		case 'M': return DBL_MIN_EXP;
		case 'U': return DBL_MIN;
		case 'L': return DBL_MAX_EXP;
		case 'O': return DBL_MAX;
	}
	return 0.0;
}
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -1,12 +1,14 @@
/* slamch.f -- translated by f2c (version 19991025).
You must link the resulting object file with the libraries:
-lf2c -lm (in that order)
*/
#include "blis.h"
#include <float.h>
#include <fenv.h>
#include <ctype.h>
#ifdef __cplusplus
extern "C" {
#endif
#include "blis.h"
#ifdef BLIS_ENABLE_LEGACY_LAMCH
double bli_pow_ri( bla_real* a, bla_integer* n );
@@ -1022,6 +1024,59 @@ L10:
} /* bli_slamc5_ */
#ifdef __cplusplus
#else
/* Returns single-precision machine parameters, selected by *cmach
   (case-insensitive; cmach_len is the Fortran hidden string length and
   is unused):

      = 'E' or 'e',   SLAMCH := eps   (relative machine precision)
      = 'S' or 's',   SLAMCH := sfmin (safe minimum: 1/sfmin does not overflow)
      = 'B' or 'b',   SLAMCH := base  (base of the machine)
      = 'P' or 'p',   SLAMCH := eps*base
      = 'N' or 'n',   SLAMCH := t     (number of base digits in the mantissa)
      = 'R' or 'r',   SLAMCH := rnd   (1.0 when rounding occurs in addition)
      = 'M' or 'm',   SLAMCH := emin  (minimum exponent before gradual underflow)
      = 'U' or 'u',   SLAMCH := rmin  (underflow threshold: base**(emin-1))
      = 'L' or 'l',   SLAMCH := emax  (largest exponent before overflow)
      = 'O' or 'o',   SLAMCH := rmax  (overflow threshold: (base**emax)*(1-eps))

   Any other selector returns 0.0f, matching LAPACK's permissive behavior. */
bla_real bli_slamch(bla_character *cmach, ftnlen cmach_len)
{
	float safe_min = FLT_MIN;
	float small    = 1.0f / FLT_MAX;

	/* If 1/rmax rounds up to (or above) rmin, nudge sfmin upward so that
	   1/sfmin cannot overflow. */
	if ( small >= safe_min )
		safe_min = small * ( 1.0f + FLT_EPSILON );

	switch ( toupper( *cmach ) )
	{
		case 'E': return FLT_EPSILON;
		case 'S': return safe_min;
		case 'B': return FLT_RADIX;
		case 'P': return FLT_RADIX*FLT_EPSILON;
		case 'N': return FLT_MANT_DIG;
		/* FLT_ROUNDS encodes round-to-nearest as the value 1. FE_TONEAREST
		   is an unrelated, implementation-defined <fenv.h> constant (0 on
		   glibc/x86), so comparing FLT_ROUNDS against it was incorrect. */
		case 'R': return FLT_ROUNDS == 1 ? 1.0f : 0.0f;
		case 'M': return FLT_MIN_EXP;
		case 'U': return FLT_MIN;
		case 'L': return FLT_MAX_EXP;
		case 'O': return FLT_MAX;
	}
	return 0.0f;
}
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -83,6 +83,12 @@ CNTX_INIT_PROTS( bulldozer )
// -- ARM architectures --
#ifdef BLIS_CONFIG_ARMSVE
CNTX_INIT_PROTS( armsve )
#endif
#ifdef BLIS_CONFIG_A64FX
CNTX_INIT_PROTS( a64fx )
#endif
#ifdef BLIS_CONFIG_THUNDERX2
CNTX_INIT_PROTS( thunderx2 )
#endif
@@ -183,6 +189,12 @@ CNTX_INIT_PROTS( generic )
// -- ARM architectures --
#ifdef BLIS_FAMILY_ARMSVE
#include "bli_family_armsve.h"
#endif
#ifdef BLIS_FAMILY_A64FX
#include "bli_family_a64fx.h"
#endif
#ifdef BLIS_FAMILY_THUNDERX2
#include "bli_family_thunderx2.h"
#endif

View File

@@ -128,6 +128,20 @@ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
// -- One-operand macro (with custom prefix) --
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
PASTECH2(prefix,s,op), \
PASTECH2(prefix,c,op), \
PASTECH2(prefix,d,op), \
PASTECH2(prefix,z,op) \
}
// -- Two-operand macros --

View File

@@ -1190,7 +1190,7 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b )
// -- Initialization-related macros --
// Finish the initialization started by the matrix-specific static initializer
// (e.g. BLIS_OBJECT_PREINITIALIZER)
// (e.g. BLIS_OBJECT_INITIALIZER)
// NOTE: This is intended only for use in the BLAS compatibility API and typed
// BLIS API.
@@ -1223,7 +1223,7 @@ BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t
}
// Finish the initialization started by the 1x1-specific static initializer
// (e.g. BLIS_OBJECT_PREINITIALIZER_1X1)
// (e.g. BLIS_OBJECT_INITIALIZER_1X1)
// NOTE: This is intended only for use in the BLAS compatibility API and typed
// BLIS API.

View File

@@ -1008,6 +1008,8 @@ typedef enum
BLIS_ARCH_BULLDOZER,
// ARM
BLIS_ARCH_ARMSVE,
BLIS_ARCH_A64FX,
BLIS_ARCH_THUNDERX2,
BLIS_ARCH_CORTEXA57,
BLIS_ARCH_CORTEXA53,
@@ -1032,7 +1034,7 @@ typedef enum
// NOTE: This value must be updated to reflect the number of enum values
// listed above for arch_t!
#define BLIS_NUM_ARCHS 22
//#define BLIS_NUM_ARCHS 25
//

View File

@@ -885,6 +885,8 @@
#define VADDSUBPD(_0, _1, _2) INSTR_(vaddsubpd, _0, _1, _2)
#define VHADDPD(_0, _1, _2) INSTR_(vhaddpd, _0, _1, _2)
#define VHADDPS(_0, _1, _2) INSTR_(vhaddps, _0, _1, _2)
#define VHSUBPD(_0, _1, _2) INSTR_(vhsubpd, _0, _1, _2)
#define VHSUBPS(_0, _1, _2) INSTR_(vhsubps, _0, _1, _2)
#define VADDPS(_0, _1, _2) INSTR_(vaddps, _0, _1, _2)
#define VADDPD(_0, _1, _2) INSTR_(vaddpd, _0, _1, _2)
#define VSUBPS(_0, _1, _2) INSTR_(vsubps, _0, _1, _2)
@@ -1015,6 +1017,8 @@
#define vaddsubpd(_0, _1, _2) VADDSUBPD(_0, _1, _2)
#define vhaddpd(_0, _1, _2) VHADDPD(_0, _1, _2)
#define vhaddps(_0, _1, _2) VHADDPS(_0, _1, _2)
#define vhsubpd(_0, _1, _2) VHSUBPD(_0, _1, _2)
#define vhsubps(_0, _1, _2) VHSUBPS(_0, _1, _2)
#define vaddps(_0, _1, _2) VADDPS(_0, _1, _2)
#define vaddpd(_0, _1, _2) VADDPD(_0, _1, _2)
#define vsubps(_0, _1, _2) VSUBPS(_0, _1, _2)

View File

@@ -0,0 +1,45 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// In-register transpose of an 8x2 double-precision tile held in two
// 512-bit SVE vector registers (assumes a 512-bit vector length, i.e.
// 8 doubles per register). DST6SRC0/DST7SRC1 serve as both the two
// inputs and the last two outputs. The trn1/trn2 pair interleaves the
// even/odd elements of the two sources; each "compact" then gathers the
// elements selected by a complement predicate (P2C/P4C/P6C -- presumably
// the complements of first-2/first-4/first-6-lane predicates, as set up
// by the d8x8 PREPARE macro; TODO confirm) into the low lanes of the
// corresponding destination register.
// NOTE(review): the PT (all-true) argument is accepted but not referenced
// in this macro body.
#define SVE512_IN_REG_TRANSPOSE_d8x2(DST0,DST1,DST2,DST3,DST4,DST5,DST6SRC0,DST7SRC1,PT,P2C,P4C,P6C) \
"trn1 " #DST0".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
"trn2 " #DST1".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
"compact " #DST2".d, " #P2C", " #DST0".d \n\t" \
"compact " #DST3".d, " #P2C", " #DST1".d \n\t" \
"compact " #DST4".d, " #P4C", " #DST0".d \n\t" \
"compact " #DST5".d, " #P4C", " #DST1".d \n\t" \
"compact " #DST6SRC0".d, " #P6C", " #DST0".d \n\t" \
"compact " #DST7SRC1".d, " #P6C", " #DST1".d \n\t"

View File

@@ -0,0 +1,97 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Sets up the predicate registers used by SVE512_IN_REG_TRANSPOSE_d8x8
// (and _d8x2). Using XTMP as a scratch GPR it builds, over .d lanes:
//   PT          = all-true;
//   P4, P6      = first-4 and first-6 lanes true (via whilelo);
//   P2C/P4C/P6C = complements of the first-2/first-4/first-6 predicates;
//   PTFTF       = (P6 XOR P4) OR P2C, i.e. an alternating true/false/
//                 true/false pattern of 2-lane groups (per the inline
//                 "o o | o" sketch) used by the sel steps of the
//                 transpose.
// NOTE(review): the final body line ends with a continuation backslash,
// so the macro definition extends onto the following source line; a
// separating blank line must remain between this macro and the next
// #define.
#define SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(XTMP,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
"ptrue " #PT".d \n\t" \
"mov " #XTMP", #2 \n\t" \
"whilelo " #P2C".d, xzr, " #XTMP" \n\t" \
"mov " #XTMP", #4 \n\t" \
"whilelo " #P4".d, xzr, " #XTMP" \n\t" \
"mov " #XTMP", #6 \n\t" \
"whilelo " #P6".d, xzr, " #XTMP" \n\t" \
\
"eor " #PTFTF".b, " #PT"/z, " #P6".b, " #P4".b \n\t" /***** o o | o */ \
"orr " #PTFTF".b, " #PT"/z, " #PTFTF".b, " #P2C".b \n\t" /* | o | o */ \
\
"not " #P2C".b, " #PT"/z, " #P2C".b \n\t" \
"not " #P4C".b, " #PT"/z, " #P4".b \n\t" \
"not " #P6C".b, " #PT"/z, " #P6".b \n\t" \
// In-register transpose of an 8x8 double-precision tile held in eight
// 512-bit SVE vector registers (SRC0-7 in, DST0-7 out; the SRC registers
// are clobbered as scratch). Predicates must be prepared beforehand by
// SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE. The transpose proceeds in three
// rounds of progressively wider element exchange:
//   1) trn1/trn2: interleave adjacent row pairs (1-element granularity);
//   2) compact/ext + sel with PTFTF: exchange 2-element groups between
//      register pairs two apart;
//   3) compact/ext + sel with P4/P4C: exchange 4-element halves between
//      registers four apart.
// "compact"/"ext" shift source elements into the lane positions the
// following "sel" merges from -- presumably chosen over zip/uzp for
// A64fx port/latency reasons; TODO confirm.
#define SVE512_IN_REG_TRANSPOSE_d8x8(DST0,DST1,DST2,DST3,DST4,DST5,DST6,DST7,SRC0,SRC1,SRC2,SRC3,SRC4,SRC5,SRC6,SRC7,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
"trn1 " #DST0".d, " #SRC0".d, " #SRC1".d \n\t" \
"trn2 " #DST1".d, " #SRC0".d, " #SRC1".d \n\t" \
"trn1 " #DST2".d, " #SRC2".d, " #SRC3".d \n\t" \
"trn2 " #DST3".d, " #SRC2".d, " #SRC3".d \n\t" \
"trn1 " #DST4".d, " #SRC4".d, " #SRC5".d \n\t" \
"trn2 " #DST5".d, " #SRC4".d, " #SRC5".d \n\t" \
"trn1 " #DST6".d, " #SRC6".d, " #SRC7".d \n\t" \
"trn2 " #DST7".d, " #SRC6".d, " #SRC7".d \n\t" \
\
"compact " #SRC0".d, " #P2C", " #DST0".d \n\t" \
"compact " #SRC2".d, " #P2C", " #DST1".d \n\t" \
"ext " #SRC1".b, " #SRC1".b, " #DST2".b, #48 \n\t" \
"ext " #SRC3".b, " #SRC3".b, " #DST3".b, #48 \n\t" \
"compact " #SRC4".d, " #P2C", " #DST4".d \n\t" \
"compact " #SRC6".d, " #P2C", " #DST5".d \n\t" \
"ext " #SRC5".b, " #SRC5".b, " #DST6".b, #48 \n\t" \
"ext " #SRC7".b, " #SRC7".b, " #DST7".b, #48 \n\t" \
\
"sel " #DST0".d, " #PTFTF", " #DST0".d, " #SRC1".d \n\t" \
"sel " #DST2".d, " #PTFTF", " #SRC0".d, " #DST2".d \n\t" \
"sel " #DST1".d, " #PTFTF", " #DST1".d, " #SRC3".d \n\t" \
"sel " #DST3".d, " #PTFTF", " #SRC2".d, " #DST3".d \n\t" \
"sel " #DST4".d, " #PTFTF", " #DST4".d, " #SRC5".d \n\t" \
"sel " #DST6".d, " #PTFTF", " #SRC4".d, " #DST6".d \n\t" \
"sel " #DST5".d, " #PTFTF", " #DST5".d, " #SRC7".d \n\t" \
"sel " #DST7".d, " #PTFTF", " #SRC6".d, " #DST7".d \n\t" \
\
"compact " #SRC0".d, " #P4C", " #DST0".d \n\t" \
"compact " #SRC1".d, " #P4C", " #DST1".d \n\t" \
"compact " #SRC2".d, " #P4C", " #DST2".d \n\t" \
"compact " #SRC3".d, " #P4C", " #DST3".d \n\t" \
"ext " #SRC4".b, " #SRC4".b, " #DST4".b, #32 \n\t" \
"ext " #SRC5".b, " #SRC5".b, " #DST5".b, #32 \n\t" \
"ext " #SRC6".b, " #SRC6".b, " #DST6".b, #32 \n\t" \
"ext " #SRC7".b, " #SRC7".b, " #DST7".b, #32 \n\t" \
\
"sel " #DST0".d, " #P4", " #DST0".d, " #SRC4".d \n\t" \
"sel " #DST1".d, " #P4", " #DST1".d, " #SRC5".d \n\t" \
"sel " #DST2".d, " #P4", " #DST2".d, " #SRC6".d \n\t" \
"sel " #DST3".d, " #P4", " #DST3".d, " #SRC7".d \n\t" \
"sel " #DST4".d, " #P4", " #SRC0".d, " #DST4".d \n\t" \
"sel " #DST5".d, " #P4", " #SRC1".d, " #DST5".d \n\t" \
"sel " #DST6".d, " #P4", " #SRC2".d, " #DST6".d \n\t" \
"sel " #DST7".d, " #P4", " #SRC3".d, " #DST7".d \n\t"

View File

@@ -52,15 +52,12 @@ void bli_dpackm_armsve256_asm_8xk
dim_t cdim_,
dim_t n_,
dim_t n_max_,
void* restrict kappa_,
void* restrict a_, inc_t inca_, inc_t lda_,
void* restrict p_, inc_t ldp_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
double* a = ( double* )a_;
double* p = ( double* )p_;
double* kappa = ( double* )kappa_;
const int64_t cdim = cdim_;
const int64_t mnr = 8;
const int64_t n = n_;

View File

@@ -0,0 +1,365 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "armsve512_asm_transpose_d8x8.h"
#include "armsve512_asm_transpose_d8x2.h"
// assumption:
// SVE vector length = 512 bits.
// Pack a 10-row micro-panel of double-precision A into P for SVE-512.
//
// Fast path (inline asm) requires: cdim == 10, kappa == 1.0, and A not in
// general stride (inca == 1 or lda == 1).  Otherwise falls back to
// bli_dscal2m_ex plus explicit zero-padding of edge regions.
//
// Each packed column is stored as one 512-bit vector (8 doubles) plus one
// 128-bit q-register (2 doubles), i.e. 10 doubles per column of P.
void bli_dpackm_armsve512_asm_10xk
(
// conja:  conjugation spec (no-op for real domain; forwarded to fallback).
conj_t conja,
// schema: pack schema; on A64FX used to pick a sector-cache tag for p.
pack_t schema,
// cdim_:  number of valid rows to pack (<= 10).
dim_t cdim_,
// n_:     number of columns to pack.
dim_t n_,
// n_max_: padded column dimension; columns [n_, n_max_) are zero-filled.
dim_t n_max_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
const int64_t cdim = cdim_;
const int64_t mnr = 10;
const int64_t n = n_;
const int64_t n_max = n_max_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
// General stride: neither rows nor columns contiguous; asm path unsupported.
const bool gs = inca != 1 && lda != 1;
const bool unitk = bli_deq1( *kappa );
#ifdef _A64FX
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
{
// A twisted way to infer whether A or B is being packed.
// NOTE(review): tags bits 56+ of the pointer, presumably for A64FX
// sector-cache control — the tag is assumed to be ignored by address
// translation; confirm against A64FX HPC extensions.
if ( schema == bli_cntx_schema_a_block(cntx) )
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
if ( schema == bli_cntx_schema_b_panel(cntx) )
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
}
#endif
// Fast path: full 10-row panel, unit kappa, no general stride.
if ( cdim == mnr && !gs && unitk )
{
// Main loop packs 8 columns per iteration; leftover handles n % 8.
uint64_t n_mker = n / 8;
uint64_t n_left = n % 8;
__asm__ volatile (
"mov x0, %[a] \n\t"
"mov x1, %[p] \n\t"
"mov x2, %[ldp] \n\t"
"mov x3, %[lda] \n\t"
"mov x4, %[inca] \n\t"
"cmp x4, #1 \n\t"
// Skips by sizeof(double).
"mov x8, #8 \n\t"
"madd x2, x2, x8, xzr \n\t"
"madd x3, x3, x8, xzr \n\t"
"madd x4, x4, x8, xzr \n\t"
// Loop constants.
"mov x8, %[n_mker] \n\t"
"mov x9, %[n_left] \n\t"
"ptrue p0.d \n\t"
// inca != 1: branch to the row-stored (transposing) path.
"b.ne .AROWSTOR \n\t"
// A stored in columns.
" .ACOLSTOR: \n\t"
// Prefetch distance.
"mov x17, #8 \n\t"
"madd x17, x17, x3, xzr \n\t"
#ifdef _A64FX
// Disable hardware prefetch for A.
"mov x16, 0x6 \n\t"
"lsl x16, x16, #60 \n\t"
"orr x0, x0, x16 \n\t"
#endif
// Column-stored main loop: copy 8 columns of 10 doubles each
// (ld1d for rows 0-7, ldr q for rows 8-9), with software prefetch.
" .ACOLSTORMKER: \n\t"
"cmp x8, xzr \n\t"
"b.eq .ACOLSTORMKEREND \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ldr q1, [x0, #64] \n\t"
"ld1d z2.d, p0/z, [x5] \n\t"
"ldr q3, [x5, #64] \n\t"
"ld1d z4.d, p0/z, [x6] \n\t"
"ldr q5, [x6, #64] \n\t"
"ld1d z6.d, p0/z, [x7] \n\t"
"ldr q7, [x7, #64] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x0, x7, x3 \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"ld1d z8.d, p0/z, [x0] \n\t"
"ldr q9, [x0, #64] \n\t"
"ld1d z10.d, p0/z, [x5] \n\t"
"ldr q11, [x5, #64] \n\t"
"ld1d z12.d, p0/z, [x6] \n\t"
"ldr q13, [x6, #64] \n\t"
"ld1d z14.d, p0/z, [x7] \n\t"
"ldr q15, [x7, #64] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
// Plain storage
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"st1d z0.d, p0, [x1] \n\t"
"str q1, [x1, #64] \n\t"
"st1d z2.d, p0, [x10] \n\t"
"str q3, [x10, #64] \n\t"
"st1d z4.d, p0, [x11] \n\t"
"str q5, [x11, #64] \n\t"
"st1d z6.d, p0, [x12] \n\t"
"str q7, [x12, #64] \n\t"
"st1d z8.d, p0, [x13] \n\t"
"str q9, [x13, #64] \n\t"
"st1d z10.d, p0, [x14] \n\t"
"str q11, [x14, #64] \n\t"
"st1d z12.d, p0, [x15] \n\t"
"str q13, [x15, #64] \n\t"
"st1d z14.d, p0, [x16] \n\t"
"str q15, [x16, #64] \n\t"
"add x1, x16, x2 \n\t"
// Realign and store.
// (Dead alternative: re-pack the 10-double columns densely into
// five whole z-vectors via EXT; kept for reference.)
// "ext z1.b, z1.b, z1.b, #16 \n\t"
// "ext z1.b, z1.b, z2.b, #48 \n\t"
// "ext z2.b, z2.b, z3.b, #16 \n\t"
// "ext z2.b, z2.b, z4.b, #32 \n\t"
// "ext z4.b, z4.b, z5.b, #16 \n\t"
// "ext z4.b, z4.b, z6.b, #16 \n\t"
// "ext z6.b, z6.b, z7.b, #16 \n\t"
// "ext z9.b, z9.b, z9.b, #16 \n\t"
// "ext z9.b, z9.b, z10.b, #48 \n\t"
// "ext z10.b, z10.b, z11.b, #16 \n\t"
// "ext z10.b, z10.b, z12.b, #32 \n\t"
// "ext z12.b, z12.b, z13.b, #16 \n\t"
// "ext z12.b, z12.b, z14.b, #16 \n\t"
// "ext z14.b, z14.b, z15.b, #16 \n\t"
// "st1d z0.d, p0, [x1] \n\t"
// "st1d z1.d, p0, [x1, #1, mul vl] \n\t"
// "st1d z2.d, p0, [x1, #2, mul vl] \n\t"
// "st1d z4.d, p0, [x1, #3, mul vl] \n\t"
// "st1d z6.d, p0, [x1, #4, mul vl] \n\t"
// "add x1, x1, #320 \n\t"
// "st1d z8.d, p0, [x1] \n\t"
// "st1d z9.d, p0, [x1, #1, mul vl] \n\t"
// "st1d z10.d, p0, [x1, #2, mul vl] \n\t"
// "st1d z12.d, p0, [x1, #3, mul vl] \n\t"
// "st1d z14.d, p0, [x1, #4, mul vl] \n\t"
// "add x1, x1, #320 \n\t"
"add x0, x7, x3 \n\t"
"sub x8, x8, #1 \n\t"
"b .ACOLSTORMKER \n\t"
" .ACOLSTORMKEREND: \n\t"
// Column-stored leftover loop: one column per iteration.
" .ACOLSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ldr q1, [x0, #64] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"str q1, [x1, #64] \n\t"
"add x0, x0, x3 \n\t"
"add x1, x1, x2 \n\t"
"sub x9, x9, #1 \n\t"
"b .ACOLSTORLEFT \n\t"
// A stored in rows.
" .AROWSTOR: \n\t"
// Prepare predicates for in-reg transpose.
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
// Row-stored main loop: load 10 rows x 8 columns, transpose in
// registers (8x8 block + 8x2 block), then store 8 packed columns.
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
"cmp x8, xzr \n\t"
"b.eq .AROWSTORMKEREND \n\t"
"add x10, x0, x4 \n\t"
"add x11, x10, x4 \n\t"
"add x12, x11, x4 \n\t"
"add x13, x12, x4 \n\t"
"add x14, x13, x4 \n\t"
"add x15, x14, x4 \n\t"
"add x16, x15, x4 \n\t"
"add x17, x16, x4 \n\t"
"add x18, x17, x4 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x10] \n\t"
"ld1d z2.d, p0/z, [x11] \n\t"
"ld1d z3.d, p0/z, [x12] \n\t"
"ld1d z4.d, p0/z, [x13] \n\t"
"ld1d z5.d, p0/z, [x14] \n\t"
"ld1d z6.d, p0/z, [x15] \n\t"
"ld1d z7.d, p0/z, [x16] \n\t"
"ld1d z22.d, p0/z, [x17] \n\t"
"ld1d z23.d, p0/z, [x18] \n\t"
// Transpose first 8 rows.
SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
// Transpose last 2 rows.
SVE512_IN_REG_TRANSPOSE_d8x2(z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3)
// Plain storage.
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"st1d z8.d, p0, [x1] \n\t"
"str q16, [x1, #64] \n\t"
"st1d z9.d, p0, [x10] \n\t"
"str q17, [x10, #64] \n\t"
"st1d z10.d, p0, [x11] \n\t"
"str q18, [x11, #64] \n\t"
"st1d z11.d, p0, [x12] \n\t"
"str q19, [x12, #64] \n\t"
"st1d z12.d, p0, [x13] \n\t"
"str q20, [x13, #64] \n\t"
"st1d z13.d, p0, [x14] \n\t"
"str q21, [x14, #64] \n\t"
"st1d z14.d, p0, [x15] \n\t"
"str q22, [x15, #64] \n\t"
"st1d z15.d, p0, [x16] \n\t"
"str q23, [x16, #64] \n\t"
"add x1, x16, x2 \n\t"
"add x0, x0, #64 \n\t"
"sub x8, x8, #1 \n\t"
"b .AROWSTORMKER \n\t"
" .AROWSTORMKEREND: \n\t"
// Row-stored leftover: gather rows 0-7 of one column with an index
// vector; rows 8-9 loaded as scalars and merged into a q-register.
"mov x4, %[inca] \n\t" // Restore unshifted inca.
"index z30.d, xzr, x4 \n\t" // Generate index.
"lsl x4, x4, #3 \n\t" // Shift again.
"lsl x5, x4, #3 \n\t" // Virtual column vl.
" .AROWSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"add x6, x0, x5 \n\t"
"add x7, x6, x4 \n\t"
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
"ldr d1, [x6] \n\t"
"ldr d2, [x7] \n\t"
"trn1 v1.2d, v1.2d, v2.2d \n\t"
"st1d z0.d, p0, [x1] \n\t"
"str q1, [x1, #64] \n\t"
"add x1, x1, x2 \n\t"
"add x0, x0, #8 \n\t"
"sub x9, x9, #1 \n\t"
"b .AROWSTORLEFT \n\t"
" .UNITKDONE: \n\t"
"mov x0, #0 \n\t"
:
: [a] "r" (a),
[p] "r" (p),
[lda] "r" (lda),
[ldp] "r" (ldp),
[inca] "r" (inca),
[n_mker] "r" (n_mker),
[n_left] "r" (n_left)
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x8", "x9", "x10","x11","x12","x13","x14","x15",
"x16","x17","x18",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19","z20","z21","z22","z23",
// "z24","z25","z26","z27","z28","z29",
"z30","z31",
"p0", "p1", "p2", "p3", "p4", // "p5",
"p6", "p7", "p8"
);
}
else // if ( cdim < mnr )
{
// Slow/general path: scaled copy handles general stride, kappa != 1,
// and partial panels (cdim < 10).
bli_dscal2m_ex
(
0,
BLIS_NONUNIT_DIAG,
BLIS_DENSE,
( trans_t )conja,
cdim,
n,
kappa,
a, inca, lda,
p, 1, ldp,
cntx,
NULL
);
// if ( cdim < mnr )
{
// Zero-fill rows [cdim, mnr) across the full padded width.
const dim_t i = cdim;
const dim_t m_edge = mnr - i;
const dim_t n_edge = n_max;
double* restrict p_edge = p + (i )*1;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}
if ( n < n_max )
{
// Zero-fill columns [n, n_max) of the packed panel.
const dim_t j = n;
const dim_t m_edge = mnr;
const dim_t n_edge = n_max - j;
double* restrict p_edge = p + (j )*ldp;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}

View File

@@ -0,0 +1,359 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Linaro Limited
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <stdio.h>
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#else
#error "No Arm SVE intrinsics support in compiler"
#endif // __ARM_FEATURE_SVE
// assumption:
// SVE vector length = 512 bits.
// TODO:
// 2-rows -> 3 vectors packing and use predicator only in odd num of rows to be packed.
// prefetching is needed.
// Pack a 12-row micro-panel of double-precision A into P using SVE-512
// ACLE intrinsics.
//
// With a 512-bit vector (8 doubles), a 12-double column occupies 1.5
// vectors; when ldp == mnr two adjacent columns (24 doubles) are packed
// as exactly 3 full vectors per iteration.  Four code paths cover the
// cross product of { kappa == 1, kappa != 1 } x { inca == 1 (contiguous,
// packA style), inca != 1 (gather, packB style) }.  Partial panels
// (cdim < 12) fall back to bli_dscal2m_ex with zero-padding.
void bli_dpackm_armsve512_asm_12xk
(
// conja:  conjugation spec (no-op for real domain; forwarded to fallback).
conj_t conja,
// schema: pack schema (unused on this path).
pack_t schema,
// cdim_:  number of valid rows to pack (<= 12).
dim_t cdim_,
// n_:     number of columns to pack.
dim_t n_,
// n_max_: padded column dimension; columns [n_, n_max_) are zero-filled.
dim_t n_max_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
const int64_t cdim = cdim_;
const int64_t mnr = 12;
const int64_t n = n_;
const int64_t n_max = n_max_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
// Cursors into A: current column, its row-8 offset, and +/-4-row offsets
// used by the two-columns-into-three-vectors scheme.
double* restrict alpha1 = a;
double* restrict alpha1_8 = alpha1 + 8 * inca;
double* restrict alpha1_p4 = alpha1 + 4 * inca;
double* restrict alpha1_m4 = alpha1 - 4 * inca;
double* restrict pi1 = p;
// Predicates: all 8 lanes / lanes 0-3 / lanes 4-7.
const svbool_t all_active = svptrue_b64();
const svbool_t first_half_active = svwhilelt_b64(0, 4);
const svbool_t last_half_active = svnot_z(all_active, first_half_active);
svfloat64_t z_a0;
svfloat64_t z_a8;
svfloat64_t z_a8_lh;
svfloat64_t z_a16;
svuint64_t z_index;
// creating index for gather/scatter
// with each element as: 0, 1*inca, 2*inca, 3*inca
z_index = svindex_u64( 0, inca * sizeof( double ) );
if ( cdim == mnr )
{
if ( bli_deq1( *kappa ) )
{
if ( inca == 1 ) // continuous memory. packA style
{
dim_t k = n;
// 2 pack into 3 case.
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// load 12 continuous elements from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// load 12 continuous elements from *a, filling last half of z8.
// (Zeroing predicates make the two halves disjoint, so the
// add below merges them without mixing data.)
z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_f64( all_active, alpha1_p4 );
// store packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
// line-by-line packing case.
for ( ; k != 0; --k )
{
// load 12 continuous elements from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// store them into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
else // gather/scatter load/store. packB style
{
dim_t k = n;
// Same two-columns-into-three-vectors scheme, but rows of A are
// strided so loads use the gather index built above.
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// gather load from *a, filling last half of z8.
z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
// store packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
for ( ; k != 0; --k )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// contiguous store into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
}
else // *kappa != 1.0
{
// load kappa into vector
svfloat64_t z_kappa;
z_kappa = svdup_f64( *kappa );
if ( inca == 1 ) // continuous memory. packA style
{
dim_t k = n;
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// load 12 continuous elements from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// load 12 continuous elements from *a, filling last half of z8.
z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_f64( all_active, alpha1_p4 );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
// store packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
for ( ; k != 0; --k )
{
// load 12 continuous elements from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
// store them into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
else // gather/scatter load/store. packB style
{
dim_t k = n;
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// gather load from *a, filling last half of z8.
z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
// store packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
for ( ; k != 0; --k )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
// contiguous store into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
} // end of if ( *kappa == 1.0 )
}
else // if ( cdim < mnr )
{
// Partial panel: scaled copy via the generic kernel, then zero-pad.
bli_dscal2m_ex
(
0,
BLIS_NONUNIT_DIAG,
BLIS_DENSE,
( trans_t )conja,
cdim,
n,
kappa,
a, inca, lda,
p, 1, ldp,
cntx,
NULL
);
// if ( cdim < mnr )
{
// Zero-fill rows [cdim, mnr) across the full padded width.
const dim_t i = cdim;
const dim_t m_edge = mnr - i;
const dim_t n_edge = n_max;
double* restrict p_edge = p + (i )*1;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}
if ( n < n_max )
{
// Zero-fill columns [n, n_max) of the packed panel.
const dim_t j = n;
const dim_t m_edge = mnr;
const dim_t n_edge = n_max - j;
double* restrict p_edge = p + (j )*ldp;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}

View File

@@ -0,0 +1,363 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "armsve512_asm_transpose_d8x8.h"
// assumption:
// SVE vector length = 512 bits.
// Pack a 16-row micro-panel of double-precision A into P for SVE-512.
//
// Fast path (inline asm) requires: cdim == 16, kappa == 1.0, and A not in
// general stride (inca == 1 or lda == 1).  Each packed column is exactly
// two 512-bit vectors.  Otherwise falls back to bli_dscal2m_ex plus
// explicit zero-padding of edge regions.
//
// FIX: the asm clobber list previously omitted z16-z31 and p8, although
// the row-stored path loads z16-z23, writes z24-z31 via the in-register
// transpose, writes z30 in the leftover loop, and the transpose-prepare
// macro writes p8.  Without these clobbers the compiler may assume those
// registers are preserved across the asm, corrupting caller state.
void bli_dpackm_armsve512_asm_16xk
(
// conja:  conjugation spec (no-op for real domain; forwarded to fallback).
conj_t conja,
// schema: pack schema; on A64FX used to pick a sector-cache tag for p.
pack_t schema,
// cdim_:  number of valid rows to pack (<= 16).
dim_t cdim_,
// n_:     number of columns to pack.
dim_t n_,
// n_max_: padded column dimension; columns [n_, n_max_) are zero-filled.
dim_t n_max_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
const int64_t cdim = cdim_;
const int64_t mnr = 16;
const int64_t n = n_;
const int64_t n_max = n_max_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
// General stride: neither rows nor columns contiguous; asm path unsupported.
const bool gs = inca != 1 && lda != 1;
const bool unitk = bli_deq1( *kappa );
#ifdef _A64FX
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
{
// A twisted way to infer whether A or B is being packed.
// NOTE(review): tags bits 56+ of the pointer, presumably for A64FX
// sector-cache control — confirm against A64FX HPC extensions.
if ( schema == bli_cntx_schema_a_block(cntx) )
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
if ( schema == bli_cntx_schema_b_panel(cntx) )
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
}
#endif
// Fast path: full 16-row panel, unit kappa, no general stride.
if ( cdim == mnr && !gs && unitk )
{
// Main loop packs 8 columns per iteration; leftover handles n % 8.
uint64_t n_mker = n / 8;
uint64_t n_left = n % 8;
__asm__ volatile (
"mov x0, %[a] \n\t"
"mov x1, %[p] \n\t"
"mov x2, %[ldp] \n\t"
"mov x3, %[lda] \n\t"
"mov x4, %[inca] \n\t"
"cmp x4, #1 \n\t"
// Skips by sizeof(double).
"mov x8, #8 \n\t"
"madd x2, x2, x8, xzr \n\t"
"madd x3, x3, x8, xzr \n\t"
"madd x4, x4, x8, xzr \n\t"
// "mov x8, 0x8 \n\t" // Control#0 for A address.
// "mov x8, 0x24 \n\t" // Higher 6bit for Control#0:
// "lsl x8, x8, #58 \n\t" // Valid|Strong|Strong|Alloc|Load|Strong
// "orr x8, x8, x3 \n\t" // Stride.
// "msr S3_3_C11_C6_0, x8 \n\t" // Write system register.
// Loop constants.
"mov x8, %[n_mker] \n\t"
"mov x9, %[n_left] \n\t"
"ptrue p0.d \n\t"
// inca != 1: branch to the row-stored (transposing) path.
"b.ne .AROWSTOR \n\t"
// A stored in columns.
" .ACOLSTOR: \n\t"
// Prefetch distance.
"mov x17, #8 \n\t"
"madd x17, x17, x3, xzr \n\t"
#ifdef _A64FX
"mov x16, 0x6 \n\t" // Disable hardware prefetch for A.
"lsl x16, x16, #60 \n\t"
"orr x0, x0, x16 \n\t"
#endif
// "add x5, x0, x3 \n\t"
// "add x6, x5, x3 \n\t"
// "add x7, x6, x3 \n\t"
// "prfm PLDL1STRM, [x0] \n\t"
// "prfm PLDL1STRM, [x5] \n\t"
// "prfm PLDL1STRM, [x6] \n\t"
// "prfm PLDL1STRM, [x7] \n\t"
// "add x18, x7, x3 \n\t"
// "add x5, x18, x3 \n\t"
// "add x6, x5, x3 \n\t"
// "add x7, x6, x3 \n\t"
// "prfm PLDL1STRM, [x18] \n\t"
// "prfm PLDL1STRM, [x5] \n\t"
// "prfm PLDL1STRM, [x6] \n\t"
// "prfm PLDL1STRM, [x7] \n\t"
// Column-stored main loop: copy 8 columns of 16 doubles each
// (two ld1d per column) with software prefetch.
" .ACOLSTORMKER: \n\t"
"cmp x8, xzr \n\t"
"b.eq .ACOLSTORMKEREND \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
"ld1d z2.d, p0/z, [x5] \n\t"
"ld1d z3.d, p0/z, [x5, #1, mul vl] \n\t"
"ld1d z4.d, p0/z, [x6] \n\t"
"ld1d z5.d, p0/z, [x6, #1, mul vl] \n\t"
"ld1d z6.d, p0/z, [x7] \n\t"
"ld1d z7.d, p0/z, [x7, #1, mul vl] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x0, x7, x3 \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"ld1d z8.d, p0/z, [x0] \n\t"
"ld1d z9.d, p0/z, [x0, #1, mul vl] \n\t"
"ld1d z10.d, p0/z, [x5] \n\t"
"ld1d z11.d, p0/z, [x5, #1, mul vl] \n\t"
"ld1d z12.d, p0/z, [x6] \n\t"
"ld1d z13.d, p0/z, [x6, #1, mul vl] \n\t"
"ld1d z14.d, p0/z, [x7] \n\t"
"ld1d z15.d, p0/z, [x7, #1, mul vl] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
"st1d z2.d, p0, [x10] \n\t"
"st1d z3.d, p0, [x10, #1, mul vl] \n\t"
"st1d z4.d, p0, [x11] \n\t"
"st1d z5.d, p0, [x11, #1, mul vl] \n\t"
"st1d z6.d, p0, [x12] \n\t"
"st1d z7.d, p0, [x12, #1, mul vl] \n\t"
"st1d z8.d, p0, [x13] \n\t"
"st1d z9.d, p0, [x13, #1, mul vl] \n\t"
"st1d z10.d, p0, [x14] \n\t"
"st1d z11.d, p0, [x14, #1, mul vl] \n\t"
"st1d z12.d, p0, [x15] \n\t"
"st1d z13.d, p0, [x15, #1, mul vl] \n\t"
"st1d z14.d, p0, [x16] \n\t"
"st1d z15.d, p0, [x16, #1, mul vl] \n\t"
"add x0, x7, x3 \n\t"
"add x1, x16, x2 \n\t"
"sub x8, x8, #1 \n\t"
"b .ACOLSTORMKER \n\t"
" .ACOLSTORMKEREND: \n\t"
// Column-stored leftover loop: one column per iteration.
" .ACOLSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
"add x0, x0, x3 \n\t"
"add x1, x1, x2 \n\t"
"sub x9, x9, #1 \n\t"
"b .ACOLSTORLEFT \n\t"
// A stored in rows.
" .AROWSTOR: \n\t"
// Prepare predicates for in-reg transpose.
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
// Row-stored main loop: load 16 rows x 8 columns, transpose two 8x8
// blocks in registers, then store 8 packed columns.
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
"cmp x8, xzr \n\t"
"b.eq .AROWSTORMKEREND \n\t"
"add x10, x0, x4 \n\t"
"add x11, x10, x4 \n\t"
"add x12, x11, x4 \n\t"
"add x13, x12, x4 \n\t"
"add x14, x13, x4 \n\t"
"add x15, x14, x4 \n\t"
"add x16, x15, x4 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x10] \n\t"
"ld1d z2.d, p0/z, [x11] \n\t"
"ld1d z3.d, p0/z, [x12] \n\t"
"ld1d z4.d, p0/z, [x13] \n\t"
"ld1d z5.d, p0/z, [x14] \n\t"
"ld1d z6.d, p0/z, [x15] \n\t"
"ld1d z7.d, p0/z, [x16] \n\t"
"add x5, x16, x4 \n\t"
"add x10, x5, x4 \n\t"
"add x11, x10, x4 \n\t"
"add x12, x11, x4 \n\t"
"add x13, x12, x4 \n\t"
"add x14, x13, x4 \n\t"
"add x15, x14, x4 \n\t"
"add x16, x15, x4 \n\t"
"ld1d z16.d, p0/z, [x5] \n\t"
"ld1d z17.d, p0/z, [x10] \n\t"
"ld1d z18.d, p0/z, [x11] \n\t"
"ld1d z19.d, p0/z, [x12] \n\t"
"ld1d z20.d, p0/z, [x13] \n\t"
"ld1d z21.d, p0/z, [x14] \n\t"
"ld1d z22.d, p0/z, [x15] \n\t"
"ld1d z23.d, p0/z, [x16] \n\t"
// Transpose first 8 rows.
SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
// Transpose last 8 rows.
SVE512_IN_REG_TRANSPOSE_d8x8(z24,z25,z26,z27,z28,z29,z30,z31,z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3,p8,p4,p6)
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"st1d z8.d, p0, [x1] \n\t"
"st1d z24.d, p0, [x1, #1, mul vl] \n\t"
"st1d z9.d, p0, [x10] \n\t"
"st1d z25.d, p0, [x10, #1, mul vl] \n\t"
"st1d z10.d, p0, [x11] \n\t"
"st1d z26.d, p0, [x11, #1, mul vl] \n\t"
"st1d z11.d, p0, [x12] \n\t"
"st1d z27.d, p0, [x12, #1, mul vl] \n\t"
"st1d z12.d, p0, [x13] \n\t"
"st1d z28.d, p0, [x13, #1, mul vl] \n\t"
"st1d z13.d, p0, [x14] \n\t"
"st1d z29.d, p0, [x14, #1, mul vl] \n\t"
"st1d z14.d, p0, [x15] \n\t"
"st1d z30.d, p0, [x15, #1, mul vl] \n\t"
"st1d z15.d, p0, [x16] \n\t"
"st1d z31.d, p0, [x16, #1, mul vl] \n\t"
"add x0, x0, #64 \n\t"
"add x1, x16, x2 \n\t"
"sub x8, x8, #1 \n\t"
"b .AROWSTORMKER \n\t"
" .AROWSTORMKEREND: \n\t"
// Row-stored leftover: gather rows 0-7 and rows 8-15 of one column
// with an index vector.
"mov x4, %[inca] \n\t" // Restore unshifted inca.
"index z30.d, xzr, x4 \n\t" // Generate index.
"lsl x4, x4, #3 \n\t" // Shift again.
"lsl x5, x4, #3 \n\t" // Virtual column vl.
" .AROWSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"add x6, x0, x5 \n\t"
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
"ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
"add x1, x1, x2 \n\t"
"add x0, x0, #8 \n\t"
"sub x9, x9, #1 \n\t"
"b .AROWSTORLEFT \n\t"
" .UNITKDONE: \n\t"
"mov x0, #0 \n\t"
:
: [a] "r" (a),
[p] "r" (p),
[lda] "r" (lda),
[ldp] "r" (ldp),
[inca] "r" (inca),
[n_mker] "r" (n_mker),
[n_left] "r" (n_left)
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x8", "x9", "x10","x11","x12","x13","x14","x15",
"x16","x17","x18",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10","z11","z12","z13","z14","z15",
// z16-z31 ARE written (row-stored loads, 8x8 transpose outputs, and
// the leftover-path index in z30); p8 is written by the transpose
// prepare macro.  They must be declared clobbered, matching the
// sibling 10xk kernel.
"z16","z17","z18","z19","z20","z21","z22","z23",
"z24","z25","z26","z27","z28","z29","z30","z31",
"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8"
);
}
else // if ( cdim < mnr )
{
// Slow/general path: scaled copy handles general stride, kappa != 1,
// and partial panels (cdim < 16).
bli_dscal2m_ex
(
0,
BLIS_NONUNIT_DIAG,
BLIS_DENSE,
( trans_t )conja,
cdim,
n,
kappa,
a, inca, lda,
p, 1, ldp,
cntx,
NULL
);
// if ( cdim < mnr )
{
// Zero-fill rows [cdim, mnr) across the full padded width.
const dim_t i = cdim;
const dim_t m_edge = mnr - i;
const dim_t n_edge = n_max;
double* restrict p_edge = p + (i )*1;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}
if ( n < n_max )
{
// Zero-fill columns [n, n_max) of the packed panel.
const dim_t j = n;
const dim_t m_edge = mnr;
const dim_t n_edge = n_max - j;
double* restrict p_edge = p + (j )*ldp;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}

View File

@@ -0,0 +1,191 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BADDR,8) \
GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BADDR,9) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
GEMM_FMLA2_LD1R(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BADDR,0) \
GEMM_FMLA2_LD1R(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BADDR,1) \
GEMM_FMLA2_LD1R(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BADDR,2) \
GEMM_FMLA2_LD1R(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BADDR,3) \
GEMM_FMLA2_LD1R(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BADDR,4) \
GEMM_FMLA2_LD1R(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BADDR,5) \
\
GEMM_FMLA2_LD1R(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BADDR,6) \
GEMM_FMLA2_LD1R(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BADDR,7)
// Second through forth microkernels are the first one with B vectors rotated.
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BRSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BRSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BRSBIT)
// NOTE:
// The microkernel (PLAIN_1-4 as a whole) satisfies on entry/exit
// (sth. akin to loop-invariant):
// - BV[0-7] holds B[0:7, 4*k_cur]
// - B's address stops at B[0, 4*k_cur+1]
// Final loop inside K=4 microkernels.
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BADDR,8) \
GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BADDR,9) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
// K=4 MKer loop with B memory scattered.
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
" mov "#BELMADDR", "#BADDR" \n\t" \
GEMM_FMLA2_LD1R_G_ELMFWD(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
\
GEMM_FMLA2_LD1R_G_ELMFWD(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT)
// Iterations 2-4 of the generic-strided K=4 chain: identical loop body,
// but the BV argument list is rotated by 2, 4 and 6 positions respectively
// to match the register-residence pattern left behind by the previous
// iteration (see the exit-state note on ..._G_1 above).
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
 GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BELMADDR,BRSBIT,BCSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
 GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BELMADDR,BRSBIT,BCSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
 GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BELMADDR,BRSBIT,BCSBIT)
// Residual (final) member of the generic-strided K=4 chain. Columns 0-1
// consume the resident BV6/BV7 values while those two registers are
// refilled with the current row's last two elements (used immediately by
// columns 8-9); the row base is then advanced and the remaining columns
// are finished with plain FMLAs - no elements of any later row are read.
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
 GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
 GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
" mov "#BELMADDR", "#BADDR" \n\t" \
 GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
 GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
 GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
 GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
 GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
 GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
 GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
 GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
// Zero all 20 accumulator vectors (10 C columns x 2 vector halves).
#define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \
 CLEAR_COL4(Z00,Z01,Z02,Z03) \
 CLEAR_COL4(Z04,Z05,Z06,Z07) \
 CLEAR_COL4(Z08,Z09,Z10,Z11) \
 CLEAR_COL4(Z12,Z13,Z14,Z15) \
 CLEAR_COL4(Z16,Z17,Z18,Z19)
// Multiply all 20 accumulator vectors by the broadcast factor ZFACTOR
// (used to apply alpha before the beta*C update).
#define SCALE_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19,ZFACTOR) \
 SCALE_COL4(Z00,Z01,Z02,Z03,ZFACTOR) \
 SCALE_COL4(Z04,Z05,Z06,Z07,ZFACTOR) \
 SCALE_COL4(Z08,Z09,Z10,Z11,ZFACTOR) \
 SCALE_COL4(Z12,Z13,Z14,Z15,ZFACTOR) \
 SCALE_COL4(Z16,Z17,Z18,Z19,ZFACTOR)
// Fused multiply-add of five C column pairs into the accumulators:
// Z := Z * ZSCALE + C per column pair (see GEMM_CCOL_FMAD).
#define GEMM_C_FMAD_UKER(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE) \
 GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
 GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
 GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
 GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
 GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE)
// Load five contiguous C columns (two vector halves each), advancing
// CADDR by the column stride CCS after each column.
#define GEMM_C_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
// Store five contiguous C columns, advancing CADDR by CCS after each.
#define GEMM_C_STORE_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
// Pipelined variant: after each column's FMAD consumes its C registers,
// those same registers are immediately refilled with the next batch of C
// data from CADDR, overlapping load latency with compute.
#define GEMM_C_FMAD_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,CADDR,CCS) \
 GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C0FH,C0LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C1FH,C1LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C2FH,C2LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C3FH,C3LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C4FH,C4LH,PFH,PLH,CADDR,CCS)
// Generic-storage counterparts of the *_UKER_C macros above: C columns
// are accessed with gather loads / scatter stores through the index
// vector ZIDX (element offsets scaled per the OFFS mode), using CTEMP as
// scratch for the second vector half's base (CADDR + CVSKIP).
#define GEMM_C_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_GATHER_LOAD_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_GATHER_LOAD_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_GATHER_LOAD_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_GATHER_LOAD_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_GATHER_LOAD_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
// Scatter-store five C columns, advancing CADDR by CCS after each.
#define GEMM_C_STORE_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_SCATTER_STORE_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_SCATTER_STORE_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_SCATTER_STORE_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_SCATTER_STORE_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_SCATTER_STORE_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
// Pipelined FMAD + gather-reload, mirroring GEMM_C_FMAD_LOAD_UKER_C.
#define GEMM_C_FMAD_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,ZIDX,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
 GEMM_CCOL_GATHER_LOAD_FWD(C0FH,C0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
 GEMM_CCOL_GATHER_LOAD_FWD(C1FH,C1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
 GEMM_CCOL_GATHER_LOAD_FWD(C2FH,C2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
 GEMM_CCOL_GATHER_LOAD_FWD(C3FH,C3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
 GEMM_CCOL_GATHER_LOAD_FWD(C4FH,C4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)

View File

@@ -0,0 +1,123 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Zero two SVE vector registers (element type given by DT, which the
// including header defines as "d"/"s"/"h").
#define CLEAR_COL2(Z0,Z1) \
" dup "#Z0"."DT", #0 \n\t" \
" dup "#Z1"."DT", #0 \n\t"
// Zero four SVE vector registers.
#define CLEAR_COL4(Z0,Z1,Z2,Z3) \
 CLEAR_COL2(Z0,Z1) \
 CLEAR_COL2(Z2,Z3)
// Multiply two SVE vector registers elementwise by the broadcast factor
// ZFACTOR (unpredicated fmul; element type given by DT).
#define SCALE_COL2(Z0,Z1,ZFACTOR) \
" fmul "#Z0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \
" fmul "#Z1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t"
/* NOTE: the trailing line-continuation backslash formerly ending the last
   fmul line above has been removed - it would splice the next #define
   into SCALE_COL2's replacement list, leaving SCALE_COL4 undefined. */
// Multiply four SVE vector registers by ZFACTOR.
#define SCALE_COL4(Z0,Z1,Z2,Z3,ZFACTOR) \
 SCALE_COL2(Z0,Z1,ZFACTOR) \
 SCALE_COL2(Z2,Z3,ZFACTOR)
// Prefetch or not.
// Token-paste dispatch: callers append a mode token (noprfm/prfm) to the
// macro name; the noprfm variant expands to nothing.
#define PREFETCH_CONTIGUOUS_noprfm(LV,PROP,ADDR,SHIFT)
#define PREFETCH_CONTIGUOUS_prfm(LV,PROP,ADDR,SHIFT) \
" prfm PLD"#LV""#PROP", ["#ADDR", "#SHIFT"] \n\t"
// Accumulate one A column (two vector halves) times the broadcast B
// element BV into one C column pair, under predicate PT.
#define GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
" fmla "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" /* A Row 0 :VL */ \
" fmla "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t" /* A Row VL:2VL */
// FMLA pair, then refill BV by broadcast-loading the B element at
// byte offset NSHIFT*SZ from BADDR (contiguous-B case).
#define GEMM_FMLA2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \
 GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
" "LD1R" "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t"
// FMLA pair, then refill BV from the element pointer BELMADDR and step
// that pointer by the B column stride BCSBIT (generic-stride case).
#define GEMM_FMLA2_LD1R_G_ELMFWD(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BELMADDR,BCSBIT) \
 GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
" "LD1R" "#BV"."DT", "#PT"/z, ["#BELMADDR"] \n\t" /* Load B */ \
" add "#BELMADDR", "#BELMADDR", "#BCSBIT" \n\t" /* Forward B element */
// Load one A column as two consecutive vectors: first half at AADDR,
// second half one vector-length further ("#1, mul vl").
#define GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR"] \n\t" \
" "LD1" "#ZLH"."DT", "#PLH"/z, ["#AADDR", #1, mul vl]\n\t"
// Gather-load one A column: first half via index vector ZIDX based at
// AADDR, second half based at AADDR + AVSKIP (ATEMP used as scratch).
#define GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
" "LD1" "#ZLH"."DT", "#PLH"/z, ["#ATEMP", "#ZIDX"."DT", "OFFS"]\n\t"
// Prefetch or not.
// Gather-prefetch counterparts, selected by pasting noprfm/prfm.
#define GEMM_ACOL_GATHER_noprfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
#define GEMM_ACOL_GATHER_prfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
" "PRFG" PLD"#LV""#PROP", "#PFH", ["#AADDR", "#ZIDX"."DT", "OFFS"] \n\t" \
" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
" "PRFG" PLD"#LV""#PROP", "#PLH", ["#ATEMP", "#ZIDX"."DT", "OFFS"] \n\t"
// Advance A to the next column and load it; optionally L1-prefetch the
// column A4KS bytes ahead of the ORIGINAL address (PREFMODE selects
// prfm/noprfm). A4KS presumably spans 4 k-iterations - named after it.
#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(ZFH,ZLH,PFH,PLH,AADDR,A4KS,ACS,ATEMP,PREFMODE) \
" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
 GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
 PREFETCH_CONTIGUOUS_ ##PREFMODE(L1,STRM,ATEMP,0)
// Generic-stride analogue: gather-prefetch at AADDR+A4KS into L1 and at
// AADDR+APS into L2 (each independently enabled), then advance AADDR by
// the column stride ACS and gather-load the new column.
#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,A4KS,APS,ACS,AVSKIP,ATEMP,PREFMODEL1,PREFMODEL2) \
" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
 GEMM_ACOL_GATHER_ ##PREFMODEL1(L1,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
" add "#ATEMP", "#AADDR", "#APS" \n\t" \
 GEMM_ACOL_GATHER_ ##PREFMODEL2(L2,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
 GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
// Load one contiguous C column (two halves) and advance CADDR by the
// column stride CCS.
#define GEMM_CCOL_CONTIGUOUS_LOAD_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
 GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,CADDR) \
" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (load) to next column. */
// Store one contiguous C column (two halves) and advance CADDR by CCS.
#define GEMM_CCOL_CONTIGUOUS_STORE_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR"] \n\t" \
" "ST1" "#ZLH"."DT", "#PLH", ["#CADDR", #1, mul vl] \n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (store) to next column. */
// Scale-and-accumulate one C column pair: Z := Z * ZSCALE + C (fmad,
// merging-predicated on PFH/PLH).
#define GEMM_CCOL_FMAD(ZFH,ZLH,PFH,PLH,CFH,CLH,ZSCALE) \
" fmad "#ZFH"."DT", "#PFH"/m, "#ZSCALE"."DT", "#CFH"."DT" \n\t" \
" fmad "#ZLH"."DT", "#PLH"/m, "#ZSCALE"."DT", "#CLH"."DT" \n\t"
// Gather-load one C column via index vector ZIDX, then advance CADDR.
#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CVSKIP,CTEMP) \
" add "#CADDR", "#CADDR", "#CCS" \n\t"
// Scatter-store one C column: first half based at CADDR, second half at
// CADDR + CVSKIP (via CTEMP), then advance CADDR by CCS.
#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
" "ST1" "#ZLH"."DT", "#PLH", ["#CTEMP", "#ZIDX"."DT", "OFFS"]\n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t"

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Specify to use double precision.
// Mnemonic fragments that specialize the shared armsve_asm_macros.h
// templates for 64-bit (double) elements.
#define DT "d"             // SVE element-size suffix.
#define LD1 "ld1d"         // Contiguous/gather load mnemonic.
#define ST1 "st1d"         // Contiguous/scatter store mnemonic.
#define LD1R "ld1rd"       // Load-and-broadcast mnemonic.
#define PRFG "prfd"        // Gather-prefetch mnemonic.
#define SZ "8"             // sizeof(double), for immediate offsets.
#define OFFS "lsl #3"      // Index-to-byte scaling in gather/scatter.
// Include macros.
#include "armsve_asm_macros.h"

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Specify to use half precision.
// Mnemonic fragments that specialize the shared armsve_asm_macros.h
// templates for 16-bit (half) elements.
#define DT "h"             // SVE element-size suffix.
#define LD1 "ld1h"         // Contiguous load mnemonic.
#define ST1 "st1h"         // Contiguous store mnemonic.
#define LD1R "ld1rh"       // Load-and-broadcast mnemonic.
#define PRFG "prfh"        // Prefetch mnemonic.
#define SZ "2"             // sizeof(half), for immediate offsets.
// OFFS is deliberately left undefined: the gather/scatter (OFFS-using)
// macros are unavailable for half precision.
// #define OFFS UNSUPPORTED
// Include macros.
#include "armsve_asm_macros.h"

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Specify to use single precision.
// Mnemonic fragments that specialize the shared armsve_asm_macros.h
// templates for 32-bit (float) elements.
#define DT "s"             // SVE element-size suffix.
#define LD1 "ld1w"         // Contiguous/gather load mnemonic.
#define ST1 "st1w"         // Contiguous/scatter store mnemonic.
#define LD1R "ld1rw"       // Load-and-broadcast mnemonic.
#define PRFG "prfw"        // Gather-prefetch mnemonic.
#define SZ "4"             // sizeof(float), for immediate offsets.
#define OFFS "uxtw #2"     // Index-to-byte scaling in gather/scatter.
// Include macros.
#include "armsve_asm_macros.h"

View File

@@ -0,0 +1,318 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschungszentrum Juelich
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Double-precision composite instructions.
#include "armsve_asm_macros_double.h"
// 2vx10 microkernels.
#include "armsve_asm_2vx10.h"
// Double-precision GEMM microkernel for Arm SVE with a (2*VL) x 10 register
// tile, using unindexed (base + immediate offset) loads for B.
// Computes C := beta*C + alpha * A*B over k0 rank-1 updates, where A is a
// packed (2*VL)-row panel and B a packed 10-column panel. The k-loop is
// unrolled by 4 (k_mker iterations) with a scalar cleanup loop (k_left).
// C may be stored contiguously in columns (rs_c == 1) or with generic
// strides, selected at run time (WRITE_MEM_C vs. WRITE_MEM_G paths).
//
// Fix vs. previous revision: the inline asm writes predicate p0 (ptrue)
// and reads/writes the A/B/C panels through pointers, so "p0" and
// "memory" are now declared in the clobber list; without them the
// compiler is free to cache C data or a predicate value across the asm.
void bli_dgemm_armsve_asm_2vx10_unindexed
     (
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
  void* a_next = bli_auxinfo_next_a( data );
  void* b_next = bli_auxinfo_next_b( data );

  // Typecast local copies of integers in case dim_t and inc_t are a
  // different size than is expected by load instructions.
  uint64_t k_mker = k0 / 4; // Number of unrolled-by-4 k-iterations.
  uint64_t k_left = k0 % 4; // Leftover k-iterations (0-3).
  uint64_t rs_c   = rs_c0;
  uint64_t cs_c   = cs_c0;

  // Register map inside the asm block:
  //   x0/x1  - A/B panel pointers;  x2 - cs_a, x3 - rs_b (both in bytes);
  //   x5     - C pointer; x6 - rs_c (elements); x7 - cs_c (bytes);
  //   x4/x8  - loop counters, later alpha/beta bit patterns;
  //   z0-z19 - C accumulators (10 columns x 2 halves);
  //   z20-z29 - B broadcasts / C staging; z28-z31 - A columns, alpha/beta.
  __asm__ volatile (
" ldr x0, %[a] \n\t"
" ldr x1, %[b] \n\t"
" mov x2, xzr \n\t"
" incd x2, ALL, MUL #2 \n\t" // Column-skip of A (2*VL doubles).
" mov x3, #10 \n\t" // Row-skip of B.
" \n\t"
" ldr x5, %[c] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
#ifdef _A64FX
// Set A64FX sector-cache tag bits (bits 56-57 of the address).
" mov x8, 0x3 \n\t" // Tag C address.
" lsl x8, x8, #56 \n\t"
" orr x5, x5, x8 \n\t"
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" \n\t"
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
" madd x2, x8, x2, xzr \n\t" // cs_a
" madd x3, x8, x3, xzr \n\t" // rs_b
" madd x7, x8, x7, xzr \n\t" // cs_c
" ptrue p0.d \n\t"
" \n\t"
" ldr x4, %[k_mker] \n\t" // Number of loops.
" ldr x8, %[k_left] \n\t"
" \n\t"
// Preload the first 8 B elements and the first A column, so the
// microkernel chain always multiplies with register-resident values.
" LOAD_ABC: \n\t"
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rd z21.d, p0/z, [x1, 8] \n\t"
" ld1rd z22.d, p0/z, [x1, 16] \n\t"
" ld1rd z23.d, p0/z, [x1, 24] \n\t"
" ld1rd z24.d, p0/z, [x1, 32] \n\t"
" ld1rd z25.d, p0/z, [x1, 40] \n\t"
" ld1rd z26.d, p0/z, [x1, 48] \n\t"
" ld1rd z27.d, p0/z, [x1, 56] \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
" \n\t"
// Prefetch all 10 C columns, but only when C columns are contiguous.
" CCOL_PRFM: \n\t"
" cmp x6, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
" mov x16, x5 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
" \n\t"
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
// Main loop: 4 rank-1 updates per iteration, double-buffering A columns
// between z28/z29 and z30/z31.
" K_MKER_LOOP: \n\t"
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" b K_MKER_LOOP \n\t"
" \n\t"
// Last unrolled iteration issues no further preloads.
" FIN_MKER_LOOP: \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
" \n\t"
// Scalar cleanup loop: one rank-1 update per iteration.
" K_LEFT_LOOP: \n\t"
" cmp x8, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
" ld1rd z20.d, p0/z, [x1] \n\t" // Load all 10 elements of current B row.
" ld1rd z21.d, p0/z, [x1, 8] \n\t"
" ld1rd z22.d, p0/z, [x1, 16] \n\t"
" ld1rd z23.d, p0/z, [x1, 24] \n\t"
" ld1rd z24.d, p0/z, [x1, 32] \n\t"
" ld1rd z25.d, p0/z, [x1, 40] \n\t"
" ld1rd z26.d, p0/z, [x1, 48] \n\t"
" ld1rd z27.d, p0/z, [x1, 56] \n\t"
" ld1rd z28.d, p0/z, [x1, 64] \n\t"
" ld1rd z29.d, p0/z, [x1, 72] \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add x0, x0, x2 \n\t" // Forward A.
" add x1, x1, x3 \n\t" // Forward B.
" sub x8, x8, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
" WRITE_MEM_PREP: \n\t"
" \n\t"
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
" ldr x8, %[beta] \n\t"
" ldr x4, [x4] \n\t" // Load alpha & beta (value, as raw bits).
" ldr x8, [x8] \n\t"
" dup z30.d, x4 \n\t" // Broadcast alpha & beta into vectors.
" dup z31.d, x8 \n\t"
" fmov d28, #1.0 \n\t" // Prepare FP 1.0 bit pattern for the
" fmov x16, d28 \n\t" //  alpha==1.0 bitwise comparison below.
" \n\t"
" PREFETCH_ABNEXT: \n\t"
" ldr x0, %[a_next] \n\t"
" ldr x1, %[b_next] \n\t"
#ifdef _A64FX
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" prfm PLDL1STRM, [x0] \n\t"
" prfm PLDL1STRM, [x0, 256*1] \n\t"
// " prfm PLDL2KEEP, [x0, 256*2] \n\t"
// " prfm PLDL2KEEP, [x0, 256*3] \n\t"
// " prfm PLDL2KEEP, [x0, 256*4] \n\t"
// " prfm PLDL2KEEP, [x0, 256*5] \n\t"
// " prfm PLDL2KEEP, [x0, 256*6] \n\t"
// " prfm PLDL2KEEP, [x0, 256*7] \n\t"
// " prfm PLDL2KEEP, [x0, 256*8] \n\t"
// " prfm PLDL2KEEP, [x0, 256*9] \n\t"
// " prfm PLDL2KEEP, [x0, 256*10] \n\t"
// " prfm PLDL2KEEP, [x0, 256*11] \n\t"
// " prfm PLDL2KEEP, [x0, 256*12] \n\t"
// " prfm PLDL2KEEP, [x0, 256*13] \n\t"
// " prfm PLDL2KEEP, [x0, 256*14] \n\t"
// " prfm PLDL2KEEP, [x0, 256*15] \n\t"
" prfm PLDL1STRM, [x1] \n\t"
" prfm PLDL1STRM, [x1, 256*1] \n\t"
// " prfm PLDL2KEEP, [x1, 256*2] \n\t"
// " prfm PLDL2KEEP, [x1, 256*3] \n\t"
// " prfm PLDL2KEEP, [x1, 256*4] \n\t"
// " prfm PLDL2KEEP, [x1, 256*5] \n\t"
// " prfm PLDL2KEEP, [x1, 256*6] \n\t"
// " prfm PLDL2KEEP, [x1, 256*7] \n\t"
// " prfm PLDL2KEEP, [x1, 256*8] \n\t"
// " prfm PLDL2KEEP, [x1, 256*9] \n\t"
" \n\t"
" mov x9, x5 \n\t" // C address for loading.
" \n\t" // C address for storing is x5 itself.
" cmp x6, #1 \n\t" // Preload first half of C for contiguous case.
" b.ne WRITE_MEM \n\t"
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
" \n\t"
" WRITE_MEM: \n\t"
" \n\t"
" cmp x16, x4 \n\t" // Skip alpha-scaling when alpha == 1.0 (bitwise).
" b.eq UNIT_ALPHA \n\t"
" \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
" \n\t"
" UNIT_ALPHA: \n\t"
" cmp x6, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-29].
// First half of C is already loaded in this case.
GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7)
" \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
" b END_WRITE_MEM \n\t"
" \n\t"
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov x8, xzr \n\t"
" incb x8 \n\t"
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
" \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" b END_EXEC \n\t"
" \n\t"
" END_ERROR: \n\t" // NOTE(review): unreachable - nothing branches here.
" mov x0, #1 \n\t" // Return error.
" END_EXEC: \n\t"
" mov x0, #0 \n\t" // Return normal.
:
: [a] "m" (a),
  [b] "m" (b),
  [c] "m" (c),
  [rs_c] "m" (rs_c),
  [cs_c] "m" (cs_c),
  [k_mker] "m" (k_mker),
  [k_left] "m" (k_left),
  [alpha] "m" (alpha),
  [beta] "m" (beta),
  [a_next] "m" (a_next),
  [b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
  "x9","x16",
  "z0","z1","z2","z3","z4","z5","z6","z7",
  "z8","z9","z10","z11","z12","z13","z14","z15",
  "z16","z17","z18","z19",
  "z20","z21","z22","z23",
  "z24","z25","z26","z27",
  "z28","z29","z30","z31",
  "p0",     // Written by ptrue inside the asm.
  "memory"  // A/B/C panels are read/written through pointer operands.
 );
}

View File

@@ -0,0 +1,307 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Copyright (C) 2019, Forschungszentrum Juelich
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Single-precision composite instructions.
#include "armsve_asm_macros_single.h"
// 2vx10 microkernels.
#include "armsve_asm_2vx10.h"
// Single-precision (fp32) GEMM microkernel for Arm SVE, "unindexed" 2vx10
// variant: computes C := beta*C + alpha * A*B on a (2*VL) x 10 microtile,
// where VL is the SVE vector length in 32-bit lanes (A's column skip below
// is two vectors, B's row skip is the fixed 10). The k dimension is
// unrolled by 4 (k_mker) with a one-column remainder loop (k_left).
//
// Register roles inside the asm (see also the inline comments):
//   x0/x1   current A / B addresses        x2/x3  cs_a / rs_b in bytes
//   x4/x8   k_mker / k_left counters, later alpha / beta bit patterns
//   x5      C address                      x6/x7  rs_c (elements) / cs_c (bytes)
//   z0-z19  C accumulators                 z20-z31 A/B operands and scratch
void bli_sgemm_armsve_asm_2vx10_unindexed
(
dim_t k0,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c0, inc_t cs_c0,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
// Addresses of the next micropanels of A and B, used only for L2 prefetch.
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_mker = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile (
" ldr x0, %[a] \n\t"
" ldr x1, %[b] \n\t"
" mov x2, xzr \n\t"
" incw x2, ALL, MUL #2 \n\t" // Column-skip of A.
" mov x3, #10 \n\t" // Row-skip of B.
" \n\t"
" ldr x5, %[c] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
// On A64FX, set sector-cache tag bits (top byte) of the A/B/C addresses.
#ifdef _A64FX
" mov x8, 0x3 \n\t" // Tag C address.
" lsl x8, x8, #56 \n\t"
" orr x5, x5, x8 \n\t"
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" \n\t"
" mov x8, #4 \n\t" // Multiply some address skips by sizeof(float).
" madd x2, x8, x2, xzr \n\t" // cs_a
" madd x3, x8, x3, xzr \n\t" // rs_b
" madd x7, x8, x7, xzr \n\t" // cs_c
" ptrue p0.s \n\t"
" \n\t"
" ldr x4, %[k_mker] \n\t" // Number of loops.
" ldr x8, %[k_left] \n\t"
" \n\t"
// Preload the first B row (8 of its 10 scalars) and the first A column
// so the unrolled k-loop can start with operands already in registers.
" LOAD_ABC: \n\t"
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rw z21.s, p0/z, [x1, 4] \n\t"
" ld1rw z22.s, p0/z, [x1, 8] \n\t"
" ld1rw z23.s, p0/z, [x1, 12] \n\t"
" ld1rw z24.s, p0/z, [x1, 16] \n\t"
" ld1rw z25.s, p0/z, [x1, 20] \n\t"
" ld1rw z26.s, p0/z, [x1, 24] \n\t"
" ld1rw z27.s, p0/z, [x1, 28] \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
" \n\t"
// Prefetch all 10 columns of C, but only when C is column-stored
// (rs_c == 1); the generic-stride path gains nothing from it.
" CCOL_PRFM: \n\t"
" cmp x6, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
" mov x16, x5 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
" \n\t"
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
// Main k-loop, unrolled by 4. A-column loads double-buffer between
// z28/z29 and z30/z31 so each FMLA block overlaps the next load.
" K_MKER_LOOP: \n\t"
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" b K_MKER_LOOP \n\t"
" \n\t"
" FIN_MKER_LOOP: \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
" \n\t"
// Remainder loop: one rank-1 update per iteration, full 10-element B row.
" K_LEFT_LOOP: \n\t"
" cmp x8, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rw z21.s, p0/z, [x1, 4] \n\t"
" ld1rw z22.s, p0/z, [x1, 8] \n\t"
" ld1rw z23.s, p0/z, [x1, 12] \n\t"
" ld1rw z24.s, p0/z, [x1, 16] \n\t"
" ld1rw z25.s, p0/z, [x1, 20] \n\t"
" ld1rw z26.s, p0/z, [x1, 24] \n\t"
" ld1rw z27.s, p0/z, [x1, 28] \n\t"
" ld1rw z28.s, p0/z, [x1, 32] \n\t"
" ld1rw z29.s, p0/z, [x1, 36] \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add x0, x0, x2 \n\t" // Forward A.
" add x1, x1, x3 \n\t" // Forward B.
" sub x8, x8, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
// Load alpha/beta: bit patterns into w4/w8 (for the alpha==1 test and
// beta restore) and broadcast vectors into z30/z31 (for scaling).
" WRITE_MEM_PREP: \n\t"
" \n\t"
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
" ldr x8, %[beta] \n\t"
" ldr w4, [x4] \n\t" // Load alpha & beta (value).
" ldr w8, [x8] \n\t"
" dup z30.s, w4 \n\t" // Broadcast alpha & beta into vectors.
" dup z31.s, w8 \n\t"
" \n\t"
" PREFETCH_ABNEXT: \n\t"
" ldr x0, %[a_next] \n\t"
" ldr x1, %[b_next] \n\t"
" prfm PLDL2KEEP, [x0] \n\t"
" prfm PLDL2KEEP, [x0, 256*1] \n\t"
" prfm PLDL2KEEP, [x0, 256*2] \n\t"
" prfm PLDL2KEEP, [x0, 256*3] \n\t"
" prfm PLDL2KEEP, [x0, 256*4] \n\t"
" prfm PLDL2KEEP, [x0, 256*5] \n\t"
" prfm PLDL2KEEP, [x0, 256*6] \n\t"
" prfm PLDL2KEEP, [x0, 256*7] \n\t"
" prfm PLDL2KEEP, [x0, 256*8] \n\t"
" prfm PLDL2KEEP, [x0, 256*9] \n\t"
" prfm PLDL2KEEP, [x0, 256*10] \n\t"
" prfm PLDL2KEEP, [x0, 256*11] \n\t"
" prfm PLDL2KEEP, [x0, 256*12] \n\t"
" prfm PLDL2KEEP, [x0, 256*13] \n\t"
" prfm PLDL2KEEP, [x0, 256*14] \n\t"
" prfm PLDL2KEEP, [x0, 256*15] \n\t"
" prfm PLDL2KEEP, [x1] \n\t"
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
" \n\t"
// Skip the alpha scaling when alpha's bit pattern equals 1.0f.
" WRITE_MEM: \n\t"
" \n\t"
" fmov s28, #1.0 \n\t"
" fmov w16, s28 \n\t"
" cmp w16, w4 \n\t"
" b.eq UNIT_ALPHA \n\t"
" \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
" \n\t"
" UNIT_ALPHA: \n\t"
" mov x9, x5 \n\t" // C address for loading.
" \n\t" // C address for storing is x5 itself.
" cmp x6, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
// Contiguous (column-stored C) write-back: load C, fuse beta*C + acc,
// store, 5 columns at a time.
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-29].
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
" \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
" b END_WRITE_MEM \n\t"
" \n\t"
// Generic-stride write-back: gather/scatter with z30 as the index vector.
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov x8, xzr \n\t"
" incb x8 \n\t"
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
" \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" b END_EXEC \n\t"
" \n\t"
" END_ERROR: \n\t"
" mov x0, #1 \n\t" // Return error.
" END_EXEC: \n\t"
" mov x0, #0 \n\t" // Return normal.
:
: [a] "m" (a),
[b] "m" (b),
[c] "m" (c),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[k_mker] "m" (k_mker),
[k_left] "m" (k_left),
[alpha] "m" (alpha),
[beta] "m" (beta),
[a_next] "m" (a_next),
[b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
"x9","x16",
"z0","z1","z2","z3","z4","z5","z6","z7",
"z8","z9","z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19",
"z20","z21","z22","z23",
"z24","z25","z26","z27",
"z28","z29","z30","z31"
);
}

View File

@@ -0,0 +1,343 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Copyright (C) 2019, Forschungszentrum Juelich
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Half-precision composite instructions.
#include "armsve_asm_macros_half.h"
// 2vx10 microkernels.
#include "armsve_asm_2vx10.h"
// Gather-load / scatter-store instruction for half-precision
// needs being defined separately.
//
// Rationale: SVE gathers/scatters address 32-bit (.s) lanes, so the
// generic-stride C path emulates a 16-bit gather with LD1H into .s lanes
// (each halfword zero-extended), a second gather offset by CRS2, a REVH
// that moves the second stream's halfwords into the upper half of each
// lane, and an FADD that merges the two disjoint halfword streams into
// one .h vector. x28 is used as an address scratch register.
// NOTE(review): callers must pass CRS2 = row skip covering two C rows —
// confirm against the WRITE_MEM_G sites below.
#undef GEMM_CCOL_GATHER_LOAD_FWD
#undef GEMM_CCOL_SCATTER_STORE_FWD
// Load one C column into the half-vector pair ZFH/ZLH (ZIDX2 = lane index
// vector, CVSKIP = skip to the second half-vector), then advance CADDR by
// the column skip CCS.
#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \
" add x28, "#CADDR", "#CRS2" \n\t" \
" ld1h z31.s, "#PT"/z, ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \
" ld1h "#ZFH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \
" fadd "#ZFH".h, "#ZFH".h, z31.h \n\t" \
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
" add x28, "#CTEMP", "#CRS2" \n\t" \
" ld1h z31.s, "#PT"/z, ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \
" ld1h "#ZLH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZLH".s, "#PT"/m, "#ZLH".s \n\t" \
" fadd "#ZLH".h, "#ZLH".h, z31.h \n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t"
// Inverse of the above: scatter the even halfwords of ZFH/ZLH, REVH to
// expose the odd halfwords, scatter those at CRS2 offset, then advance
// CADDR by CCS. Note: REVH destroys ZFH/ZLH in the process.
#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \
" add x28, "#CADDR", "#CRS2" \n\t" \
" st1h "#ZFH".s, "#PT", ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \
" st1h "#ZFH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
" add x28, "#CTEMP", "#CRS2" \n\t" \
" st1h "#ZLH".s, "#PT", ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZLH".s, "#PT"/m, "#ZLH".s \n\t" \
" st1h "#ZLH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t"
// Half-precision (fp16) GEMM microkernel for Arm SVE, "unindexed" 2vx10
// variant: computes C := beta*C + alpha * A*B on a (2*VL) x 10 microtile,
// where VL is the SVE vector length in 16-bit lanes. Structure mirrors the
// fp32 kernel: k unrolled by 4 (k_mker) plus a remainder loop (k_left),
// contiguous or gather/scatter write-back depending on rs_c.
//
// BUG FIX (WRITE_MEM_PREP): the original code read the alpha/beta bit
// patterns with "fmov w4, h28" / "fmov w8, h29", but h28/h29 still held
// stale A/B panel data at that point — alpha and beta were broadcast into
// z30/z31 by the two ld1rh instructions just above. The stale w4 defeated
// the alpha==1 fast-path test, and the stale w8 corrupted the beta vector
// restores ("dup z31.h, w8") in the generic-stride path. Fixed to read
// h30/h31, the lanes that actually contain alpha and beta.
void bli_shgemm_armsve_asm_2vx10_unindexed
(
dim_t k0,
void* restrict alpha,
void* restrict a,
void* restrict b,
void* restrict beta,
void* restrict c, inc_t rs_c0, inc_t cs_c0,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
// Addresses of the next micropanels of A and B, used only for L2 prefetch.
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_mker = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile (
" ldr x0, %[a] \n\t"
" ldr x1, %[b] \n\t"
" mov x2, xzr \n\t"
" inch x2, ALL, MUL #2 \n\t" // Column-skip of A.
" mov x3, #10 \n\t" // Row-skip of B.
" \n\t"
" ldr x5, %[c] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
// On A64FX, set sector-cache tag bits (top byte) of the A/B/C addresses.
#ifdef _A64FX
" mov x8, 0x3 \n\t" // Tag C address.
" lsl x8, x8, #56 \n\t"
" orr x5, x5, x8 \n\t"
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" \n\t"
" mov x8, #2 \n\t" // Multiply some address skips by sizeof(float16_t).
" madd x2, x8, x2, xzr \n\t" // cs_a
" madd x3, x8, x3, xzr \n\t" // rs_b
" madd x7, x8, x7, xzr \n\t" // cs_c
" ptrue p0.b \n\t"
" \n\t"
" ldr x4, %[k_mker] \n\t" // Number of loops.
" ldr x8, %[k_left] \n\t"
" \n\t"
// Preload the first B row (8 of its 10 scalars) and the first A column.
" LOAD_ABC: \n\t"
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rh z21.h, p0/z, [x1, 2] \n\t"
" ld1rh z22.h, p0/z, [x1, 4] \n\t"
" ld1rh z23.h, p0/z, [x1, 6] \n\t"
" ld1rh z24.h, p0/z, [x1, 8] \n\t"
" ld1rh z25.h, p0/z, [x1, 10] \n\t"
" ld1rh z26.h, p0/z, [x1, 12] \n\t"
" ld1rh z27.h, p0/z, [x1, 14] \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
" \n\t"
// Prefetch the 10 C columns only when C is column-stored (rs_c == 1).
" CCOL_PRFM: \n\t"
" cmp x6, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
" mov x16, x5 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
" \n\t"
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
// Main k-loop, unrolled by 4, double-buffering A columns in z28/29, z30/31.
" K_MKER_LOOP: \n\t"
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" b K_MKER_LOOP \n\t"
" \n\t"
" FIN_MKER_LOOP: \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
" \n\t"
// Remainder loop: one rank-1 update per iteration, full 10-element B row.
" K_LEFT_LOOP: \n\t"
" cmp x8, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rh z21.h, p0/z, [x1, 2] \n\t"
" ld1rh z22.h, p0/z, [x1, 4] \n\t"
" ld1rh z23.h, p0/z, [x1, 6] \n\t"
" ld1rh z24.h, p0/z, [x1, 8] \n\t"
" ld1rh z25.h, p0/z, [x1, 10] \n\t"
" ld1rh z26.h, p0/z, [x1, 12] \n\t"
" ld1rh z27.h, p0/z, [x1, 14] \n\t"
" ld1rh z28.h, p0/z, [x1, 16] \n\t"
" ld1rh z29.h, p0/z, [x1, 18] \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add x0, x0, x2 \n\t" // Forward A.
" add x1, x1, x3 \n\t" // Forward B.
" sub x8, x8, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
// Load alpha/beta: broadcast vectors into z30/z31, and their halfword
// bit patterns into w4/w8 for the alpha==1 test and beta restore below.
" WRITE_MEM_PREP: \n\t"
" \n\t"
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
" ldr x8, %[beta] \n\t"
" ld1rh z30.h, p0/z, [x4] \n\t" // Load alpha & beta into vectors.
" ld1rh z31.h, p0/z, [x8] \n\t"
" fmov w4, h30 \n\t" // Copy alpha & beta to GP registers.
" fmov w8, h31 \n\t" // (h30/h31 = lane 0 of the broadcasts above.)
" \n\t"
" PREFETCH_ABNEXT: \n\t"
" ldr x0, %[a_next] \n\t"
" ldr x1, %[b_next] \n\t"
" prfm PLDL2KEEP, [x0] \n\t"
" prfm PLDL2KEEP, [x0, 256*1] \n\t"
" prfm PLDL2KEEP, [x0, 256*2] \n\t"
" prfm PLDL2KEEP, [x0, 256*3] \n\t"
" prfm PLDL2KEEP, [x0, 256*4] \n\t"
" prfm PLDL2KEEP, [x0, 256*5] \n\t"
" prfm PLDL2KEEP, [x0, 256*6] \n\t"
" prfm PLDL2KEEP, [x0, 256*7] \n\t"
" prfm PLDL2KEEP, [x0, 256*8] \n\t"
" prfm PLDL2KEEP, [x0, 256*9] \n\t"
" prfm PLDL2KEEP, [x0, 256*10] \n\t"
" prfm PLDL2KEEP, [x0, 256*11] \n\t"
" prfm PLDL2KEEP, [x0, 256*12] \n\t"
" prfm PLDL2KEEP, [x0, 256*13] \n\t"
" prfm PLDL2KEEP, [x0, 256*14] \n\t"
" prfm PLDL2KEEP, [x0, 256*15] \n\t"
" prfm PLDL2KEEP, [x1] \n\t"
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
" \n\t"
// Skip the alpha scaling when alpha's bit pattern equals fp16 1.0.
" WRITE_MEM: \n\t"
" \n\t"
" fmov h28, #1.0 \n\t"
" fmov w16, h28 \n\t"
" cmp w16, w4 \n\t"
" b.eq UNIT_ALPHA \n\t"
" \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
" \n\t"
" UNIT_ALPHA: \n\t"
" mov x9, x5 \n\t" // C address for loading.
" \n\t" // C address for storing is x5 itself.
" cmp x6, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
// Contiguous (column-stored C) write-back path.
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-29].
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
" \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
" b END_WRITE_MEM \n\t"
" \n\t"
// Generic-stride write-back using the halfword gather/scatter emulation
// macros defined above; z31 (beta) is clobbered by them and re-broadcast
// from w8 after each load/store group.
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov x10, xzr \n\t"
" incb x10 \n\t"
" madd x10, x10, x6, xzr \n\t" // C-column's logical 1-vector skip.
" mov x28, #2 \n\t"
" madd x6, x28, x6, xzr \n\t" // Double index skip for half-precision case.
" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x9,x7,x10,x16)
" dup z31.h, w8 \n\t" // Restore beta destroyed by loading.
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x9,x7,x10,x16)
" \n\t"
" dup z31.h, w8 \n\t" // Restore beta destroyed by loading.
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x5,x7,x10,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x5,x7,x10,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" b END_EXEC \n\t"
" \n\t"
" END_ERROR: \n\t"
" mov x0, #1 \n\t" // Return error.
" END_EXEC: \n\t"
" mov x0, #0 \n\t" // Return normal.
:
: [a] "m" (a),
[b] "m" (b),
[c] "m" (c),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[k_mker] "m" (k_mker),
[k_left] "m" (k_left),
[alpha] "m" (alpha),
[beta] "m" (beta),
[a_next] "m" (a_next),
[b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
"x9","x16","x10","x28",
"z0","z1","z2","z3","z4","z5","z6","z7",
"z8","z9","z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19",
"z20","z21","z22","z23",
"z24","z25","z26","z27",
"z28","z29","z30","z31"
);
}

View File

@@ -0,0 +1,450 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Separate instantiation for ArmSVE reference kernels.
// Temporary workaround. Will be removed after upstream has switched to a better way
// of exposing gemmsup interface.
//
// -- Row storage case ---------------------------------------------------------
//
// Reference gemmsup kernel template, row-traversal order: for each element
// c(i,j) (rows outermost), accumulate the dot product ab = a(i,:)*b(:,j)
// over k, then apply c(i,j) := alpha*ab + beta*c(i,j) with fast paths for
// beta == 1 (axpys) and beta == 0 (scal2s, avoids reading garbage C).
// The four branches differ only in where conjugation is applied during
// the accumulation (dots / axpyjs / dotjs), with the conja&&conjb case
// conjugating the finished dot product instead.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
/* NOTE: This microkernel can actually handle arbitrarily large
values of m, n, and k. */ \
\
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
PASTEMAC(ch,conjs)( ab ); \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
}
// Instantiate the row-traversal kernel for the basic datatypes
// (bli_?gemmsup_r_armsve_ref2).
INSERT_GENTFUNC_BASIC2( gemmsup_r, _armsve, _ref2 )
//
// -- Column storage case ------------------------------------------------------
//
// Reference gemmsup kernel template, column-traversal order: identical
// arithmetic to the row-storage template above, but iterates columns in
// the outer loop so that column-stored C is written in storage order.
// Per element: ab = a(i,:)*b(:,j) accumulated over k (with conjugation per
// branch), then c(i,j) := alpha*ab + beta*c(i,j) with beta == 1 / beta == 0
// fast paths.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
/* NOTE: This microkernel can actually handle arbitrarily large
values of m, n, and k. */ \
\
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
PASTEMAC(ch,conjs)( ab ); \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
}
// Instantiate the column-traversal kernel for the basic datatypes
// (bli_?gemmsup_c_armsve_ref2).
INSERT_GENTFUNC_BASIC2( gemmsup_c, _armsve, _ref2 )

View File

@@ -0,0 +1,528 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <assert.h>
// Double-precision composite instructions.
#include "../armsve_asm_macros_double.h"
// 2vx10 microkernels.
#include "../armsve_asm_2vx10.h"
// Prototype reference kernel.
GEMMSUP_KER_PROT( double, d, gemmsup_c_armsve_ref2 )
// 2vx10 double-precision GEMMSUP millikernel for the c*c (column-stored)
// case on Arm SVE.  A must be column-contiguous (rs_a0 == 1).  Handles the
// n-dimension edge (n0 % 10) by delegating to the reference kernel, then
// drives an inline-assembly millikernel over panels of 10 columns of B/C
// and 2-vector-wide column strips of A/C.
//
// Fixes vs. previous revision:
//  * The one-shot diagnostic printed "rv called." (copy-pasted from the rv
//    variant); it now correctly identifies this cv kernel.
//  * The "Prefetch 3/4 of A" sequence computed x10+x4 three times, issuing
//    three prefetches for the SAME column; the adds now accumulate so the
//    next three columns are prefetched as the comment intends.  This is a
//    hint-only change — results are unaffected.
void __attribute__ ((noinline,optimize(0))) bli_dgemmsup_cv_armsve_2vx10_unindexed
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
  static int called = 0;
  if ( !called )
  {
    // One-shot debug breadcrumb identifying which sup kernel was selected.
    fprintf(stderr, "cv called.\n");
    called = 1;
  }
  // c*c requires A to be stored in columns.
  assert( rs_a0 == 1 );

  dim_t n0_mker = n0 / 10;
  dim_t n0_left = n0 % 10;

  if ( n0_left )
  {
    // Delegate the rightmost (n0 % 10)-column edge of B/C to the reference
    // kernel before running the assembly path on full 10-column panels.
    // A[:, ::]
    // B[::, n0_mker*10:n0]
    // C[: , n0_mker*10:n0]
    double *ai = a;
    double *bi = b + n0_mker * 10 * cs_b0;
    double *ci = c + n0_mker * 10 * cs_c0;
    bli_dgemmsup_c_armsve_ref2
    (
      conja, conjb,
      m0, n0_left, k0,
      alpha,
      ai, rs_a0, cs_a0,
      bi, rs_b0, cs_b0,
      beta,
      ci, rs_c0, cs_c0,
      data,
      cntx
    );
  }
  // Return if it's a pure edge case.
  if ( !n0_mker )
    return;

  // Determine VL: vlen2 = number of doubles held by TWO SVE vectors.
  uint64_t vlen2;
  __asm__ (
    " mov  x0, xzr          \n\t"
    " incd x0, ALL, MUL #2  \n\t"
    " mov  %[vlen2], x0     \n\t"
    : [vlen2] "=r" (vlen2)
    :
    : "x0"
  );

  uint64_t rs_c   = rs_c0;
  uint64_t cs_c   = cs_c0;
  // uint64_t rs_a = 1;
  uint64_t cs_a   = cs_a0;
  uint64_t rs_b   = rs_b0;
  uint64_t cs_b   = cs_b0;
  uint64_t k_mker = k0 / 4;
  uint64_t k_left = k0 % 4;
  uint64_t n_mker = n0_mker;

  dim_t m0_mker = m0 / vlen2;
  dim_t m0_left = m0 % vlen2;
  if ( m0_left )
  {
    // Edge case on A side can be handled with one more (predicated) loop.
    m0_mker++;
  } else
    m0_left = vlen2;
  // uint64_t ps_a = bli_auxinfo_ps_a( data );
  uint64_t ps_b = bli_auxinfo_ps_b( data );

  for ( dim_t im0_mker = 0; im0_mker < m0_mker; ++im0_mker )
  {
    uint64_t m_curr = vlen2;
    if ( im0_mker == m0_mker - 1 )
    {
      // Last m-loop. Maybe unnecessary.
      m_curr = m0_left;
    }
    double *ai = a + im0_mker * vlen2 * rs_a0;
    double *bi = b;
    double *ci = c + im0_mker * vlen2 * rs_c0;

    void* a_next = bli_auxinfo_next_a( data );
    void* b_next = bli_auxinfo_next_b( data );
    __asm__ volatile (
" ldr             x0, %[bi]                       \n\t"
" ldr             x1, %[rs_b]                     \n\t" // Row-skip of B.
" ldr             x2, %[cs_b]                     \n\t" // Column-skip of B (element skip of B[l, :]).
" ldr             x3, %[ps_b]                     \n\t" // Panel-skip (10*k) of B.
" ldr             x4, %[cs_a]                     \n\t" // Column-Skip of A.
"                                                 \n\t" // Element skip of A[:, l] is guaranteed to be 1.
" ldr             x5, %[ci]                       \n\t"
" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
#ifdef _A64FX
"                                                 \n\t" // A64FX: tag addresses with sector-cache hints in the
"                                                 \n\t" // top byte — presumably steering the L1/L2 sectors;
"                                                 \n\t" // TODO confirm against the A64FX microarch manual.
" mov             x16, 0x1                        \n\t" // Tag C address.
" lsl             x16, x16, #56                   \n\t"
" orr             x5, x5, x16                     \n\t"
" mov             x16, 0x2                        \n\t" // Tag B address.
" lsl             x16, x16, #56                   \n\t"
" orr             x0, x0, x16                     \n\t"
#endif
"                                                 \n\t"
" mov             x8, #8                          \n\t" // Multiply some address skips by sizeof(double).
" madd            x1, x8, x1, xzr                 \n\t" // rs_b
" madd            x2, x8, x2, xzr                 \n\t" // cs_b
" madd            x3, x8, x3, xzr                 \n\t" // ps_b
" madd            x4, x8, x4, xzr                 \n\t" // cs_a
" madd            x7, x8, x7, xzr                 \n\t" // cs_c
" mov             x8, #4                          \n\t"
" madd            x15, x8, x4, xzr                \n\t" // Logical K=4 microkernel skip for A.
"                                                 \n\t"
#ifdef _A64FX
" mov             x16, 0x20                       \n\t" // Higher 6bit for Control#2:
" lsl             x16, x16, #58                   \n\t" // Valid|Strong|Strong|NoAlloc|Load|Strong
" orr             x16, x16, x4                    \n\t" // Stride.
" msr             S3_3_C11_C6_2, x16              \n\t" // Write system register.
#endif
"                                                 \n\t"
" ldr             x8, %[m_curr]                   \n\t" // Size of first dimension.
" mov             x9, xzr                         \n\t"
" incd            x9                              \n\t"
" ptrue           p0.d                            \n\t"
" whilelo         p1.d, xzr, x8                   \n\t" // p1/p2 predicate the low/high A vector of the strip.
" whilelo         p2.d, x9, x8                    \n\t"
"                                                 \n\t"
" ldr             x8, %[n_mker]                   \n\t" // Number of N-loops.
"                                                 \n\t"
" ldr             x20, %[ai]                      \n\t" // Parameters to be reloaded
" ldr             x21, %[k_mker]                  \n\t" //  within each millikernel loop.
" ldr             x22, %[k_left]                  \n\t"
" ldr             x23, %[alpha]                   \n\t"
" ldr             x24, %[beta]                    \n\t"
" ldr             x25, %[a_next]                  \n\t"
" ldr             x26, %[b_next]                  \n\t"
" ldr             x23, [x23]                      \n\t" // Directly load alpha and beta.
" ldr             x24, [x24]                      \n\t"
"                                                 \n\t"
" MILLIKER_MLOOP:                                 \n\t"
"                                                 \n\t"
" mov             x11, x0                         \n\t" // B's address.
// " ldr          x10, %[ai]                      \n\t" // A's address.
" mov             x10, x20                        \n\t"
// " ldr          x12, %[k_mker]                  \n\t"
" mov             x12, x21                        \n\t"
// " ldr          x13, %[k_left]                  \n\t"
" mov             x13, x22                        \n\t"
#ifdef _A64FX
" mov             x16, 0x3                        \n\t" // Tag A address.
" lsl             x16, x16, #56                   \n\t"
" orr             x10, x10, x16                   \n\t"
" mov             x16, 0xa                        \n\t" // Control#2 for A address.
" lsl             x16, x16, #60                   \n\t"
" orr             x10, x10, x16                   \n\t"
#endif
"                                                 \n\t"
" cmp             x12, #0                         \n\t" // Don't preload if no microkernel there.
" b.eq            END_CCOL_PRFM                   \n\t"
"                                                 \n\t"
" mov             x14, x11                        \n\t"
" ld1rd           z20.d, p0/z, [x14]              \n\t" // Load 8/10 of first B row.
" add             x14, x14, x2                    \n\t"
" ld1rd           z21.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z22.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z23.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z24.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z25.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z26.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z27.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" prfm            PLDL1KEEP, [x14]                \n\t" // And prefetch the 2/10 left.
" add             x14, x14, x2                    \n\t"
" prfm            PLDL1KEEP, [x14]                \n\t"
" sub             x14, x14, x2                    \n\t" // Restore x14 to load edge.
"                                                 \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p1,p2,x10)
" add             x16, x10, x4                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t" // Prefetch 3/4 of A.
" add             x16, x16, x4                    \n\t" // Accumulate: next column (was a repeated x10+x4).
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x4                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
"                                                 \n\t"
" CCOL_PRFM:                                      \n\t"
" cmp             x6, #1                          \n\t"
" b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
" mov             x16, x5                         \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" END_CCOL_PRFM:                                  \n\t"
"                                                 \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
"                                                 \n\t"
" cmp             x12, #0                         \n\t" // If no 4-microkernel can be applied
" b.eq            K_LEFT_LOOP                     \n\t"
"                                                 \n\t"
" K_MKER_LOOP:                                    \n\t"
"                                                 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
"                                                 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
"                                                 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
"                                                 \n\t"
" subs            x12, x12, #1                    \n\t" // Decrease counter before final replica.
" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
"                                                 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
" b               K_MKER_LOOP                     \n\t"
"                                                 \n\t"
" FIN_MKER_LOOP:                                  \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
" add             x10, x10, x4                    \n\t" // Forward A to fill the blank.
"                                                 \n\t"
" K_LEFT_LOOP:                                    \n\t"
" cmp             x13, #0                         \n\t" // End of execution.
" b.eq            WRITE_MEM_PREP                  \n\t"
"                                                 \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p1,p2,x10)
" mov             x14, x11                        \n\t"
" ld1rd           z20.d, p0/z, [x14]              \n\t" // Load 10/10 B.
" add             x14, x14, x2                    \n\t"
" ld1rd           z21.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z22.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z23.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z24.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z25.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z26.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z27.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z28.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z29.d, p0/z, [x14]              \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add             x10, x10, x4                    \n\t" // Forward A.
" add             x11, x11, x1                    \n\t" // Forward B.
" sub             x13, x13, #1                    \n\t"
" b               K_LEFT_LOOP                     \n\t" // Next column / row.
"                                                 \n\t"
" WRITE_MEM_PREP:                                 \n\t"
"                                                 \n\t"
// " ldr          x10, %[ai]                      \n\t"
" mov             x10, x20                        \n\t"
" add             x11, x0, x3                     \n\t"
" dup             z30.d, x23                      \n\t" // Broadcast alpha & beta into vectors.
" dup             z31.d, x24                      \n\t"
"                                                 \n\t"
" cmp             x8, #1                          \n\t"
" b.eq            PREFETCH_ABNEXT                 \n\t"
" prfm            PLDL1STRM, [x10]                \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" b               WRITE_MEM                       \n\t"
"                                                 \n\t"
" PREFETCH_ABNEXT:                                \n\t"
// " ldr          x1, %[a_next]                   \n\t" // Final Millikernel loop, x1 and x2 not needed.
" mov             x1, x25                         \n\t"
// " ldr          x2, %[b_next]                   \n\t"
" mov             x2, x26                         \n\t"
" prfm            PLDL2KEEP, [x1]                 \n\t"
" prfm            PLDL2KEEP, [x1, 256*1]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*2]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*3]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*4]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*5]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*6]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*7]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*8]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*9]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*10]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*11]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*12]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*13]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*14]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*15]         \n\t"
" prfm            PLDL2KEEP, [x2]                 \n\t"
" prfm            PLDL2KEEP, [x2, 256*1]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*2]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*3]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*4]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*5]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*6]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*7]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*8]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*9]          \n\t"
"                                                 \n\t"
" WRITE_MEM:                                      \n\t"
"                                                 \n\t"
" fmov            d28, #1.0                       \n\t" // Compare alpha's bit pattern against 1.0's
" fmov            x16, d28                        \n\t" //  to skip the scaling when alpha == 1.
" cmp             x16, x23                        \n\t"
" b.eq            UNIT_ALPHA                      \n\t"
"                                                 \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
"                                                 \n\t"
" UNIT_ALPHA:                                     \n\t"
" mov             x9, x5                          \n\t" // C address for loading.
"                                                 \n\t" // C address for storing is x5 itself.
" cmp             x6, #1                          \n\t"
" b.ne            WRITE_MEM_G                     \n\t"
"                                                 \n\t"
" WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
"                                                 \n\t" // Here used scratch: Z[20-29].
" mov             x13, xzr                        \n\t" // C-column's physical 1-vector skip.
" incb            x13                             \n\t"
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
"                                                 \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x5,x7)
" b               END_WRITE_MEM                   \n\t"
"                                                 \n\t"
" WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
"                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov             x12, xzr                        \n\t"
" incb            x12                             \n\t"
" madd            x13, x12, x6, xzr               \n\t" // C-column's logical 1-vector skip.
" index           z30.d, xzr, x6                  \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
"                                                 \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x5,x7,x13,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x5,x7,x13,x16)
"                                                 \n\t"
" END_WRITE_MEM:                                  \n\t"
" subs            x8, x8, #1                      \n\t"
" b.eq            END_EXEC                        \n\t"
"                                                 \n\t" // Address of C already forwarded to next column.
" add             x0, x0, x3                      \n\t" // Forward B's base address to the next logic panel.
" b               MILLIKER_MLOOP                  \n\t"
"                                                 \n\t"
" END_ERROR:                                      \n\t"
" mov             x0, #1                          \n\t" // Return error.
" END_EXEC:                                       \n\t"
" mov             x0, #0                          \n\t" // Return normal.
:
: [bi]     "m" (bi),
  [rs_b]   "m" (rs_b),
  [cs_b]   "m" (cs_b),
  [ps_b]   "m" (ps_b),
  [cs_a]   "m" (cs_a),
  [ci]     "m" (ci),
  [rs_c]   "m" (rs_c),
  [cs_c]   "m" (cs_c),
  [m_curr] "m" (m_curr),
  [n_mker] "m" (n_mker),
  [ai]     "m" (ai),
  [k_mker] "m" (k_mker),
  [k_left] "m" (k_left),
  [alpha]  "m" (alpha),
  [beta]   "m" (beta),
  [a_next] "m" (a_next),
  [b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
  "x9","x10","x11","x12","x13","x14","x15","x16","x17",
  "x20","x21","x22","x23","x24","x25","x26",
  "z0","z1","z2","z3","z4","z5","z6","z7",
  "z8","z9","z10","z11","z12","z13","z14","z15",
  "z16","z17","z18","z19",
  "z20","z21","z22","z23",
  "z24","z25","z26","z27",
  "z28","z29","z30","z31"
    );
  }
}
// 10x2v row-major sup kernel, realized as a transposition of the 2vx10
// column-major kernel: computing C^T = B^T * A^T with all operand roles,
// conjugations, and row/column strides swapped yields the same C.
void bli_dgemmsup_rv_armsve_10x2v_unindexed
     (
       conj_t              conjat,
       conj_t              conjbt,
       dim_t               m0t,
       dim_t               n0t,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict at, inc_t rs_at0, inc_t cs_at0,
       double*    restrict bt, inc_t rs_bt0, inc_t cs_bt0,
       double*    restrict beta,
       double*    restrict ct, inc_t rs_ct0, inc_t cs_ct0,
       auxinfo_t* restrict datat,
       cntx_t*    restrict cntx
     )
{
  // Build a local auxinfo whose A/B roles mirror the caller's, since the
  // transposed invocation below presents B as "A" and A as "B".
  auxinfo_t data_tr;
  bli_auxinfo_set_ps_a( bli_auxinfo_ps_b( datat ), &data_tr );
  bli_auxinfo_set_ps_b( bli_auxinfo_ps_a( datat ), &data_tr );
  bli_auxinfo_set_next_a( bli_auxinfo_next_b( datat ), &data_tr );
  bli_auxinfo_set_next_b( bli_auxinfo_next_a( datat ), &data_tr );

  // Dispatch to the cv kernel with operands exchanged and strides
  // transposed (rs <-> cs) on every matrix.
  bli_dgemmsup_cv_armsve_2vx10_unindexed
  (
    conjbt, conjat,
    n0t, m0t, k0,
    alpha,
    bt, cs_bt0, rs_bt0,
    at, cs_at0, rs_at0,
    beta,
    ct, cs_ct0, rs_ct0,
    &data_tr,
    cntx
  );
}

View File

@@ -0,0 +1,412 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <assert.h>
// Double-precision composite instructions.
#include "../armsve_asm_macros_double.h"
// 2vx10 microkernels.
#include "../armsve_asm_2vx10.h"
// Prototype reference kernel.
GEMMSUP_KER_PROT( double, d, gemmsup_r_armsve_ref2 )
// 2vx10 double-precision GEMMSUP millikernel for the r*r (row-stored)
// case on Arm SVE.  B must be row-contiguous (cs_b0 == 1); A columns are
// gathered via an SVE index vector since rs_a is arbitrary.  The
// n-dimension edge (n0 % 10) is delegated to the reference kernel; full
// 10-column panels are handled by the inline-assembly millikernel, which
// loops over the M dimension in strips of two SVE vectors of rows.
// NOTE(review): the asm relies on exact register/predicate sequencing
// (p1/p2 are overwritten only on the final M iteration; z30/z31 double as
// alpha/beta and as scratch), so only comments are added here.
void __attribute__ ((optimize(0))) bli_dgemmsup_rv_armsve_2vx10_unindexed
     (
       conj_t           conja,
       conj_t           conjb,
       dim_t            m0,
       dim_t            n0,
       dim_t            k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
  // One-shot debug breadcrumb identifying which sup kernel was selected.
  static int called = 0;
  if ( !called )
  {
    fprintf(stderr, "rv called.\n");
    called = 1;
  }
  // r*r requires B to be stored in rows.
  assert(cs_b0 == 1);

  dim_t n0_mker = n0 / 10;
  dim_t n0_left = n0 % 10;

  if ( n0_left )
  {
    // Delegate the rightmost (n0 % 10)-column edge of B/C to the
    // reference kernel before running the assembly path.
    // A[:, ::]
    // B[::, n0_mker*10:n0]
    // C[: , n0_mker*10:n0]
    double *ai = a;
    double *bi = b + n0_mker * 10 * cs_b0;
    double *ci = c + n0_mker * 10 * cs_c0;
    bli_dgemmsup_r_armsve_ref2
    (
      conja, conjb,
      m0, n0_left, k0,
      alpha,
      ai, rs_a0, cs_a0,
      bi, rs_b0, cs_b0,
      beta,
      ci, rs_c0, cs_c0,
      data,
      cntx
    );
  }
  // Return if it's a pure edge case.
  if ( !n0_mker )
    return;

  // Determine VL: vlen2 = number of doubles held by TWO SVE vectors.
  uint64_t vlen2;
  __asm__ (
    " mov  x0, xzr          \n\t"
    " incd x0, ALL, MUL #2  \n\t"
    " mov  %[vlen2], x0     \n\t"
    : [vlen2] "=r" (vlen2)
    :
    : "x0"
  );

  uint64_t rs_c   = rs_c0;
  uint64_t cs_c   = cs_c0;
  uint64_t rs_a   = rs_a0;
  uint64_t cs_a   = cs_a0;
  uint64_t rs_b   = rs_b0;
  // uint64_t cs_b = 1;
  uint64_t k_mker = k0 / 4;
  uint64_t k_left = k0 % 4;
  // Number of M strips; a partial strip is folded in as one extra
  // (predicated) iteration, with m_left holding its true height.
  uint64_t m_mker = m0 / vlen2;
  uint64_t m_left = m0 % vlen2;
  if ( m_left )
  {
    // Edge case on A side can be handled with one more (predicated) loop.
    m_mker++;
  } else
    m_left = vlen2;
  uint64_t ps_a = bli_auxinfo_ps_a( data );
  // uint64_t ps_b = bli_auxinfo_ps_b( data );

  for ( dim_t in0_mker = 0; in0_mker < n0_mker; ++in0_mker )
  {
    double *ai = a;
    double *bi = b + in0_mker * 10 * cs_b0;
    double *ci = c + in0_mker * 10 * cs_c0;

    void* a_next = bli_auxinfo_next_a( data );
    void* b_next = bli_auxinfo_next_b( data );
    __asm__ volatile (
" ldr             x0, %[ai]                       \n\t"
" ldr             x1, %[rs_a]                     \n\t" // Row-skip of A (element skip of A[:, l]).
" ldr             x2, %[cs_a]                     \n\t" // Column-skip of A.
" ldr             x3, %[ps_a]                     \n\t" // Panel-skip (vlen2*k) of A.
" ldr             x4, %[rs_b]                     \n\t" // Row-Skip of B.
"                                                 \n\t" // Element skip of B[l, :] is guaranteed to be 1.
" ldr             x5, %[ci]                       \n\t"
" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
#ifdef _A64FX
"                                                 \n\t" // A64FX: top-byte sector-cache tags —
"                                                 \n\t" // presumably cache-steering hints; TODO confirm.
" mov             x16, 0x1                        \n\t" // Tag C address.
" lsl             x16, x16, #56                   \n\t"
" orr             x5, x5, x16                     \n\t"
" mov             x16, 0x2                        \n\t" // Tag A address.
" lsl             x16, x16, #56                   \n\t"
" orr             x0, x0, x16                     \n\t"
#endif
"                                                 \n\t"
" mov             x8, #8                          \n\t" // Multiply some address skips by sizeof(double).
" madd            x2, x8, x2, xzr                 \n\t" // cs_a
" madd            x3, x8, x3, xzr                 \n\t" // ps_a
" madd            x4, x8, x4, xzr                 \n\t" // rs_b
" madd            x7, x8, x7, xzr                 \n\t" // cs_c
" mov             x8, xzr                         \n\t"
" incb            x8                              \n\t"
" madd            x14, x8, x1, xzr                \n\t" // A-column's logical 1-vector skip.
" mov             x8, #4                          \n\t"
" madd            x15, x8, x2, xzr                \n\t" // Logical K=4 microkernel skip for A.
// " mov          x8, #4                          \n\t"
// " madd         x17, x8, x4, xzr                \n\t" // Logical K=4 microkernel skip for B.
"                                                 \n\t"
" ldr             x8, %[m_mker]                   \n\t" // Number of M-loops.
"                                                 \n\t" // All-true predicates; p1/p2 are narrowed only
"                                                 \n\t" //  on the final (partial) M iteration below.
" ptrue           p0.d                            \n\t"
" ptrue           p1.d                            \n\t"
" ptrue           p2.d                            \n\t"
"                                                 \n\t"
" MILLIKER_MLOOP:                                 \n\t"
"                                                 \n\t"
" cmp             x8, #1                          \n\t"
" b.ne            UKER_BEGIN                      \n\t"
"                                                 \n\t"
" ldr             x10, %[m_left]                  \n\t" // Final (incomplete) millikernel loop.
" mov             x11, xzr                        \n\t"
" incd            x11                             \n\t"
" whilelo         p1.d, xzr, x10                  \n\t" // Overwrite p1/p2.
" whilelo         p2.d, x11, x10                  \n\t"
"                                                 \n\t"
" UKER_BEGIN:                                     \n\t"
" mov             x10, x0                         \n\t" // A's address.
" ldr             x11, %[bi]                      \n\t" // B's address.
" ldr             x12, %[k_mker]                  \n\t"
" ldr             x13, %[k_left]                  \n\t"
#ifdef _A64FX
" mov             x16, 0x3                        \n\t" // Tag B address.
" lsl             x16, x16, #56                   \n\t"
" orr             x11, x11, x16                   \n\t"
#endif
"                                                 \n\t"
" mov             x16, x11                        \n\t" // Prefetch first kernel of B.
" prfm            PLDL1KEEP, [x16]                \n\t"
" add             x16, x16, x4                    \n\t"
" prfm            PLDL1KEEP, [x16]                \n\t"
" add             x16, x16, x4                    \n\t"
" prfm            PLDL1KEEP, [x16]                \n\t"
" add             x16, x16, x4                    \n\t"
" prfm            PLDL1KEEP, [x16]                \n\t"
"                                                 \n\t"
"                                                 \n\t" // B rows are contiguous: broadcast-load 8 of the
"                                                 \n\t" //  10 elements with immediate offsets.
" ld1rd           z20.d, p0/z, [x11]              \n\t" // (Partial) first B row.
" ld1rd           z21.d, p0/z, [x11, #8]          \n\t"
" ld1rd           z22.d, p0/z, [x11, #16]         \n\t"
" ld1rd           z23.d, p0/z, [x11, #24]         \n\t"
" ld1rd           z24.d, p0/z, [x11, #32]         \n\t"
" ld1rd           z25.d, p0/z, [x11, #40]         \n\t"
" ld1rd           z26.d, p0/z, [x11, #48]         \n\t"
" ld1rd           z27.d, p0/z, [x11, #56]         \n\t"
"                                                 \n\t"
" index           z29.d, xzr, x1                  \n\t" // First A column.
"                                                 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_ACOL_GATHER_LOAD(z28,z29,z29,p1,p2,x10,x14,x16)
"                                                 \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
"                                                 \n\t"
" cmp             x12, #0                         \n\t" // If no 4-microkernel can be applied
" b.eq            K_LEFT_LOOP                     \n\t"
"                                                 \n\t"
" K_MKER_LOOP:                                    \n\t" // Unroll the 4-loop.
"                                                 \n\t"
" index           z31.d, xzr, x1                  \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
"                                                 \n\t"
" index           z29.d, xzr, x1                  \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
"                                                 \n\t"
" index           z31.d, xzr, x1                  \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
"                                                 \n\t"
" subs            x12, x12, #1                    \n\t" // Decrease counter before final replica.
" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
"                                                 \n\t"
" index           z29.d, xzr, x1                  \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
" b               K_MKER_LOOP                     \n\t"
"                                                 \n\t"
" FIN_MKER_LOOP:                                  \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
" add             x10, x10, x2                    \n\t" // Forward A to fill the blank.
"                                                 \n\t"
" K_LEFT_LOOP:                                    \n\t"
" cmp             x13, #0                         \n\t"
" b.eq            WRITE_MEM_PREP                  \n\t"
"                                                 \n\t"
" index           z31.d, xzr, x1                  \n\t"
GEMM_ACOL_GATHER_LOAD(z30,z31,z31,p1,p2,x10,x14,x16)
" ld1rd           z20.d, p0/z, [x11]              \n\t"
" ld1rd           z21.d, p0/z, [x11, #8]          \n\t"
" ld1rd           z22.d, p0/z, [x11, #16]         \n\t"
" ld1rd           z23.d, p0/z, [x11, #24]         \n\t"
" ld1rd           z24.d, p0/z, [x11, #32]         \n\t"
" ld1rd           z25.d, p0/z, [x11, #40]         \n\t"
" ld1rd           z26.d, p0/z, [x11, #48]         \n\t"
" ld1rd           z27.d, p0/z, [x11, #56]         \n\t"
" ld1rd           z28.d, p0/z, [x11, #64]         \n\t"
" ld1rd           z29.d, p0/z, [x11, #72]         \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add             x10, x10, x2                    \n\t" // Forward A.
" add             x11, x11, x4                    \n\t" // Forward B.
" sub             x13, x13, #1                    \n\t"
" b               K_LEFT_LOOP                     \n\t" // Next column / row.
"                                                 \n\t"
" WRITE_MEM_PREP:                                 \n\t"
"                                                 \n\t"
" ldr             x11, %[bi]                      \n\t"
" ldr             x12, %[alpha]                   \n\t" // Load alpha & beta.
" ldr             x13, %[beta]                    \n\t"
" ld1rd           z30.d, p0/z, [x12]              \n\t"
" ld1rd           z31.d, p0/z, [x13]              \n\t"
" ldr             x12, [x12]                      \n\t" // Alpha's raw bits, for the ==1.0 test below.
"                                                 \n\t"
" cmp             x8, #1                          \n\t"
" b.eq            PREFETCH_ABNEXT                 \n\t"
" prfm            PLDL2STRM, [x11]                \n\t"
" b               WRITE_MEM                       \n\t"
"                                                 \n\t"
" PREFETCH_ABNEXT:                                \n\t"
" ldr             x1, %[a_next]                   \n\t" // Final Millikernel loop, x1 and x2 not needed.
" ldr             x2, %[b_next]                   \n\t"
" prfm            PLDL2KEEP, [x1]                 \n\t"
" prfm            PLDL2KEEP, [x1, 256*1]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*2]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*3]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*4]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*5]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*6]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*7]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*8]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*9]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*10]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*11]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*12]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*13]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*14]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*15]         \n\t"
" prfm            PLDL2KEEP, [x2]                 \n\t"
" prfm            PLDL2KEEP, [x2, 256*1]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*2]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*3]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*4]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*5]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*6]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*7]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*8]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*9]          \n\t"
"                                                 \n\t"
" WRITE_MEM:                                      \n\t"
"                                                 \n\t"
" fmov            d28, #1.0                       \n\t" // Bit-compare alpha against 1.0 to skip scaling.
" fmov            x16, d28                        \n\t"
" cmp             x16, x12                        \n\t"
" b.eq            UNIT_ALPHA                      \n\t"
"                                                 \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
"                                                 \n\t"
" UNIT_ALPHA:                                     \n\t"
" mov             x9, x5                          \n\t" // C address for loading.
" mov             x10, x5                         \n\t" // C address for storing.
" cmp             x6, #1                          \n\t"
" b.ne            WRITE_MEM_G                     \n\t"
"                                                 \n\t"
" WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
"                                                 \n\t" // Here used scratch: Z[20-29].
" mov             x13, xzr                        \n\t" // C-column's physical 1-vector skip.
" incb            x13                             \n\t"
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
"                                                 \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x10,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x10,x7)
" b               END_WRITE_MEM                   \n\t"
"                                                 \n\t"
" WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
"                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov             x12, xzr                        \n\t"
" incb            x12                             \n\t"
" madd            x13, x12, x6, xzr               \n\t" // C-column's logical 1-vector skip.
" index           z30.d, xzr, x6                  \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
"                                                 \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x10,x7,x13,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x10,x7,x13,x16)
"                                                 \n\t"
" END_WRITE_MEM:                                  \n\t"
" subs            x8, x8, #1                      \n\t"
" b.eq            END_EXEC                        \n\t"
"                                                 \n\t"
" add             x0, x0, x3                      \n\t" // Forward A's base address to the next logic panel.
"                                                 \n\t" // Forward C by TWO 1-vector row skips (x13), i.e.
"                                                 \n\t" //  one full 2-vector strip — matches vlen2 above.
" add             x5, x5, x13                     \n\t" // Forward C's base address to the next logic panel.
" add             x5, x5, x13                     \n\t"
" b               MILLIKER_MLOOP                  \n\t"
"                                                 \n\t"
" END_ERROR:                                      \n\t"
" mov             x0, #1                          \n\t" // Return error.
" END_EXEC:                                       \n\t"
" mov             x0, #0                          \n\t" // Return normal.
:
: [ai]     "m" (ai),
  [rs_a]   "m" (rs_a),
  [cs_a]   "m" (cs_a),
  [ps_a]   "m" (ps_a),
  [rs_b]   "m" (rs_b),
  [ci]     "m" (ci),
  [rs_c]   "m" (rs_c),
  [cs_c]   "m" (cs_c),
  [m_mker] "m" (m_mker),
  [m_left] "m" (m_left),
  [bi]     "m" (bi),
  [k_mker] "m" (k_mker),
  [k_left] "m" (k_left),
  [alpha]  "m" (alpha),
  [beta]   "m" (beta),
  [a_next] "m" (a_next),
  [b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
  "x9","x10","x11","x12","x13","x14","x15","x16",//"x17",
  "z0","z1","z2","z3","z4","z5","z6","z7",
  "z8","z9","z10","z11","z12","z13","z14","z15",
  "z16","z17","z18","z19",
  "z20","z21","z22","z23",
  "z24","z25","z26","z27",
  "z28","z29","z30","z31"
    );
  }
}

View File

@@ -33,5 +33,13 @@
*/
// Level-3 microkernel prototypes for the Arm SVE (and fixed-256/512-bit SVE)
// kernel set.
GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 )
GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed )
GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed )
// GEMMSUP (skinny-matrix) kernels: row-stored, column-stored, and the
// 10x2v transposing wrapper around the cv kernel.
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed )
GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed )
// Packing kernels for the fixed-vector-length variants.
PACKM_KER_PROT( double, d, packm_armsve256_asm_8xk )
PACKM_KER_PROT( double, d, packm_armsve512_asm_16xk )
PACKM_KER_PROT( double, d, packm_armsve512_asm_12xk )
PACKM_KER_PROT( double, d, packm_armsve512_asm_10xk )

View File

@@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_3xk
mov(var(kappa), rcx) // load address of kappa
vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate
vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate
vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate
// now branch on kappa == 1.0

View File

@@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_8xk
mov(var(kappa), rcx) // load address of kappa
vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate
vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate
vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate
// now branch on kappa == 1.0

View File

@@ -0,0 +1,88 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Given the current architecture of BLIS sandboxes, bli_gemmnat() is the
// entry point to any sandbox implementation.
// NOTE: This function is implemented identically to the function that it
// overrides in frame/ind/oapi/bli_l3_nat_oapi.c. This means that we are
// forgoing the option of customizing the implementations that underlie
// bli_gemm() and bli_?gemm(). Any new code defined in this sandbox
// directory, however, will be included in the BLIS library.
#include "blis.h"

// Define a macro that expands into the definition of bli_gemmnat(), the
// object-based native-gemm entry point that this sandbox overrides. The
// macro parameters select the operation name and induced-method suffix
// used by the PASTEMAC name-pasting macros below.
#undef  GENFRONT
#define GENFRONT( opname, cname, imeth ) \
\
void PASTEMAC(opname,imeth) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
\
	/* A switch to easily toggle whether we use the sandbox implementation
	   of bls_gemm() as the implementation for bli_gemm(). (This allows for
	   easy testing of bls_gemm() via the testsuite.) */ \
	if ( 1 ) \
	{ \
		bls_gemm_ex( alpha, a, b, beta, c, cntx, rntm ); \
		return; \
	} \
\
	/* NOTE: The code below is unreachable while the switch above is
	   hard-coded to 1; it preserves the conventional implementation from
	   frame/ind/oapi/bli_l3_nat_oapi.c. */ \
	bli_init_once(); \
\
	/* Obtain a valid (native) context from the gks if necessary. */ \
	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
	/* Initialize a local runtime with global settings if necessary. Note
	   that in the case that a runtime is passed in, we make a local copy. */ \
	rntm_t rntm_l; \
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
\
	/* Invoke the operation's front end. */ \
	PASTEMAC(opname,_front) \
	( \
	  alpha, a, b, beta, c, cntx, rntm, NULL \
	); \
}

// Instantiate the macro above, defining bli_gemmnat().
GENFRONT( gemm, gemm, nat )

View File

@@ -0,0 +1,56 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SANDBOX_H
#define BLIS_SANDBOX_H

// NOTE: This header is the only header required to be present in the sandbox
// implementation directory.

// This header should contain (or #include) any definitions that must be
// folded into blis.h. Typically, it will remain empty since any header
// definitions specific to the sandbox implementation will not need to be
// made available to applications (or the framework) during compilation.

// Pull in the sandbox's local headers: the gemm-like operation's APIs and
// variants, the packing facilities for matrices A and B, and the threading
// decorator.
#include "bls_gemm.h"
#include "bls_gemm_var.h"
#include "bls_l3_packm_a.h"
#include "bls_l3_packm_b.h"
#include "bls_l3_packm_var.h"
#include "bls_l3_decor.h"

#endif

304
sandbox/gemmlike/bls_gemm.c Normal file
View File

@@ -0,0 +1,304 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// -- Define the gemm-like operation's object API ------------------------------
//
void bls_gemm
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c
     )
{
	// Basic (non-expert) interface. Defer to the expert interface, passing
	// NULL for the context and runtime so that suitable defaults are
	// acquired within bls_gemm_ex().
	bls_gemm_ex( alpha, a, b, beta, c, NULL, NULL );
}
// Expert interface for the sandbox's gemm-like operation. This function
// performs the combined roles of bli_gemmnat() (context/runtime acquisition)
// and bli_gemm_front() (parameter checking, early returns, and inducing
// transpositions) before spawning threads into bls_gemm_int().
void bls_gemm_ex
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
	bli_init_once();

	// -- bli_gemmnat() --------------------------------------------------------

	// Obtain a valid (native) context from the gks if necessary.
	// NOTE: This must be done before calling the _check() function, since
	// that function assumes the context pointer is valid.
	if ( cntx == NULL ) cntx = bli_gks_query_cntx();

	// Initialize a local runtime with global settings if necessary. Note
	// that in the case that a runtime is passed in, we make a local copy.
	rntm_t rntm_l;
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
	else                { rntm_l = *rntm;                       rntm = &rntm_l; }

	// -- bli_gemm_front() -----------------------------------------------------

	obj_t a_local;
	obj_t b_local;
	obj_t c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
	{
		bli_gemm_check( alpha, a, b, beta, c, cntx );
	}

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( c ) )
	{
		return;
	}

	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
	// and return early.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
	     bli_obj_has_zero_dim( a ) ||
	     bli_obj_has_zero_dim( b ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );
	bli_obj_alias_to( c, &c_local );

	// Induce a transposition of A if it has its transposition property set.
	// Then clear the transposition bit in the object.
	if ( bli_obj_has_trans( &a_local ) )
	{
		bli_obj_induce_trans( &a_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
	}

	// Induce a transposition of B if it has its transposition property set.
	// Then clear the transposition bit in the object.
	if ( bli_obj_has_trans( &b_local ) )
	{
		bli_obj_induce_trans( &b_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &b_local );
	}

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
	{
		bli_obj_swap( &a_local, &b_local );

		bli_obj_induce_trans( &a_local );
		bli_obj_induce_trans( &b_local );
		bli_obj_induce_trans( &c_local );

		// NOTE: This is probably not needed within the sandbox.
		// We must also swap the pack schemas, which were set by bli_gemm_md()
		// or the inlined code above.
		//bli_obj_swap_pack_schemas( &a_local, &b_local );
	}

	// Parse and interpret the contents of the rntm_t object to properly
	// set the ways of parallelism for each loop, and then make any
	// additional modifications necessary for the current operation.
	bli_rntm_set_ways_for_op
	(
	  BLIS_GEMM,
	  BLIS_LEFT, // ignored for gemm/hemm/symm
	  bli_obj_length( &c_local ),
	  bli_obj_width( &c_local ),
	  bli_obj_width( &a_local ),
	  rntm
	);

	// Spawn threads (if applicable), where bls_gemm_int() is the thread entry
	// point function for each thread. This also begins the process of creating
	// the thrinfo_t tree, which contains thread communicators.
	bls_l3_thread_decorator
	(
	  bls_gemm_int,
	  BLIS_GEMM, // operation family id
	  alpha,
	  &a_local,
	  &b_local,
	  beta,
	  &c_local,
	  cntx,
	  rntm
	);
}
//
// -- Define the gemm-like operation's thread entry point ----------------------
//
// Thread entry point for the gemm-like operation. Each thread spawned by
// bls_l3_thread_decorator() (see bls_gemm_ex()) begins execution here.
void bls_gemm_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	// In this function, we choose the gemm implementation that is executed
	// on each thread. The choice is made at compile time via the #if below.

#if 1

	// Call the block-panel algorithm that calls the kernel directly, which
	// exposes edge-case handling.
	bls_gemm_bp_var1
	(
	  alpha,
	  a,
	  b,
	  beta,
	  c,
	  cntx,
	  rntm,
	  thread
	);

#else

	// Call the block-panel algorithm that calls the kernel indirectly via a
	// wrapper function, which hides edge-case handling.
	bls_gemm_bp_var2
	(
	  alpha,
	  a,
	  b,
	  beta,
	  c,
	  cntx,
	  rntm,
	  thread
	);

#endif
}
//
// -- Define the gemm-like operation's typed API -------------------------------
//
// Define a macro that expands into one typed (BLAS-like) wrapper function,
// bls_?gemm(), per datatype. Each wrapper packages its scalar/matrix
// arguments into bufferless obj_t objects and calls the object API above.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c \
     ) \
{ \
	bli_init_once(); \
\
	/* Determine the datatype (e.g. BLIS_FLOAT, BLIS_DOUBLE, etc.) based on
	   the macro parameter 'ch' (e.g. s, d, etc). */ \
	const num_t dt = PASTEMAC(ch,type); \
\
	obj_t alphao, ao, bo, betao, co; \
\
	dim_t m_a, n_a; \
	dim_t m_b, n_b; \
\
	/* Adjust the dimensions of matrices A and B according to the transa and
	   transb parameters. */ \
	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
	bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \
\
	/* Create bufferless scalar objects and attach the provided scalar pointers
	   to those scalar objects. */ \
	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
	bli_obj_create_1x1_with_attached_buffer( dt, beta,  &betao ); \
\
	/* Create bufferless matrix objects and attach the provided matrix pointers
	   to those matrix objects. */ \
	bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
	bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
	bli_obj_create_with_attached_buffer( dt, m,   n,   c, rs_c, cs_c, &co ); \
\
	/* Set the transposition/conjugation properties of the objects for matrices
	   A and B. */ \
	bli_obj_set_conjtrans( transa, &ao ); \
	bli_obj_set_conjtrans( transb, &bo ); \
\
	/* Call the object interface. */ \
	PASTECH(bls_,opname) \
	( \
	  &alphao, \
	  &ao, \
	  &bo, \
	  &betao, \
	  &co \
	); \
}

// Instantiate the macro above for each of the four standard datatypes,
// defining bls_sgemm(), bls_dgemm(), bls_cgemm(), and bls_zgemm().
//INSERT_GENTFUNC_BASIC0( gemm )
GENTFUNC( float,    s, gemm )
GENTFUNC( double,   d, gemm )
GENTFUNC( scomplex, c, gemm )
GENTFUNC( dcomplex, z, gemm )

101
sandbox/gemmlike/bls_gemm.h Normal file
View File

@@ -0,0 +1,101 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// -- Prototype the gemm-like operation's object API ---------------------------
//

// Basic (non-expert) interface; the context and runtime are obtained
// internally.
void bls_gemm
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c
     );

// Expert interface; the caller may supply a context and/or runtime, or pass
// NULL for either to request defaults.
void bls_gemm_ex
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     );

//
// -- Prototype the gemm-like operation's thread entry point -------------------
//

// Executed by each thread spawned on behalf of bls_gemm_ex().
void bls_gemm_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

//
// -- Prototype the gemm-like operation's typed API ----------------------------
//

// Define a macro that expands into one typed prototype, bls_?gemm(), per
// datatype character (s, d, c, z).
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c \
     );

//INSERT_GENTPROT_BASIC0( gemm )
GENTPROT( float,    s, gemm )
GENTPROT( double,   d, gemm )
GENTPROT( scomplex, c, gemm )
GENTPROT( dcomplex, z, gemm )

View File

@@ -0,0 +1,521 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"

// Use the name gemm_fp for the function pointer type defined below.
#define FUNCPTR_T gemm_fp

// The signature shared by the typed (per-datatype) variant functions defined
// later in this file; the object-based wrapper selects one of them at
// runtime based on the datatype of C.
typedef void (*FUNCPTR_T)
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m,
       dim_t               n,
       dim_t               k,
       void*      restrict alpha,
       void*      restrict a, inc_t rs_a, inc_t cs_a,
       void*      restrict b, inc_t rs_b, inc_t cs_b,
       void*      restrict beta,
       void*      restrict c, inc_t rs_c, inc_t cs_c,
       cntx_t*    restrict cntx,
       rntm_t*    restrict rntm,
       thrinfo_t* restrict thread
     );
//
// -- gemm-like block-panel algorithm (object interface) -----------------------
//
// Define a function pointer array named ftypes whose elements hold the
// addresses of the typed functions defined below, bls_?gemm_bp_var1().
static FUNCPTR_T GENARRAY_PREF(ftypes,bls_,gemm_bp_var1);

void bls_gemm_bp_var1
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	// Query the storage datatype of C and the conjugation statuses of A
	// and B.
	const num_t  dt    = bli_obj_dt( c );
	const conj_t conja = bli_obj_conj_status( a );
	const conj_t conjb = bli_obj_conj_status( b );

	// Query the operation dimensions: C is m x n, and A contributes the
	// k dimension.
	const dim_t  m = bli_obj_length( c );
	const dim_t  n = bli_obj_width( c );
	const dim_t  k = bli_obj_width( a );

	// Query the buffer address of each matrix operand.
	void* restrict buf_a = bli_obj_buffer_at_off( a );
	void* restrict buf_b = bli_obj_buffer_at_off( b );
	void* restrict buf_c = bli_obj_buffer_at_off( c );

	// Query the row and column strides of each matrix operand.
	const inc_t rs_a = bli_obj_row_stride( a );
	const inc_t cs_a = bli_obj_col_stride( a );
	const inc_t rs_b = bli_obj_row_stride( b );
	const inc_t cs_b = bli_obj_col_stride( b );
	const inc_t rs_c = bli_obj_row_stride( c );
	const inc_t cs_c = bli_obj_col_stride( c );

	// Query the buffer addresses of the scalar objects alpha and beta.
	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );

	// Select the typed variant function that corresponds to the datatype
	// of C, then invoke it with the values gathered above.
	FUNCPTR_T fp = ftypes[ dt ];

	fp
	(
	  conja, conjb,
	  m, n, k,
	  buf_alpha,
	  buf_a, rs_a, cs_a,
	  buf_b, rs_b, cs_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
//
// -- gemm-like block-panel algorithm (typed interface) ------------------------
//
// Define a macro that expands into one typed implementation of the
// block-panel gemm algorithm per datatype (see the GENTFUNC invocations at
// the bottom of this file). This variant calls the microkernel directly and
// therefore handles m/n edge cases itself via a temporary microtile buffer.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       void*      restrict alpha, \
       void*      restrict a, inc_t rs_a, inc_t cs_a, \
       void*      restrict b, inc_t rs_b, inc_t cs_b, \
       void*      restrict beta, \
       void*      restrict c, inc_t rs_c, inc_t cs_c, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       thrinfo_t* restrict thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Query the context for various blocksizes. */ \
	const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
	const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
	const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
	const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
	const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
	/* Query the context for the microkernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	   gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are chosen according to the microkernel's storage
	   preference: if the microkernel prefers contiguous columns, ct will
	   be column-stored, and row-stored otherwise. */ \
	ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
	                / sizeof( ctype ) ] \
	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool  col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct    = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct    = ( col_pref ? MR : 1 ); \
\
	/* Compute partitioning step values for each matrix of each loop. */ \
	const inc_t jcstep_c = cs_c; \
	const inc_t jcstep_b = cs_b; \
\
	const inc_t pcstep_a = cs_a; \
	const inc_t pcstep_b = rs_b; \
\
	const inc_t icstep_c = rs_c; \
	const inc_t icstep_a = rs_a; \
\
	const inc_t jrstep_c = cs_c * NR; \
\
	const inc_t irstep_c = rs_c * MR; \
\
	ctype* restrict a_00       = a; \
	ctype* restrict b_00       = b; \
	ctype* restrict c_00       = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
\
	/* Make local copies of the scalars to prevent any unnecessary sharing of
	   cache lines between the cores' caches. */ \
	ctype           alpha_local = *alpha_cast; \
	ctype           beta_local  = *beta_cast; \
	ctype           one_local   = *PASTEMAC(ch,1); \
	ctype           zero_local  = *PASTEMAC(ch,0); \
\
	auxinfo_t       aux; \
\
	/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
	   needed for the matrix we will be packing (if any), but we do it
	   unconditionally to be safe. */ \
	mem_t mem_a = BLIS_MEM_INITIALIZER; \
	mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
	/* Define an array of bszid_t ids, which will act as our substitute for
	   the cntl_t tree. */ \
	bszid_t bszids[8] = { BLIS_NC,      /* 5th loop */ \
	                      BLIS_KC,      /* 4th loop */ \
	                      BLIS_NO_PART, /* pack B */ \
	                      BLIS_MC,      /* 3rd loop */ \
	                      BLIS_NO_PART, /* pack A */ \
	                      BLIS_NR,      /* 2nd loop */ \
	                      BLIS_MR,      /* 1st loop */ \
	                      BLIS_KR };    /* microkernel loop */ \
\
	bszid_t* restrict bszids_jc = &bszids[0]; \
	bszid_t* restrict bszids_pc = &bszids[1]; \
	/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
	bszid_t* restrict bszids_ic = &bszids[3]; \
	/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
	bszid_t* restrict bszids_jr = &bszids[5]; \
	/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
\
	thrinfo_t* restrict thread_jc = NULL; \
	thrinfo_t* restrict thread_pc = NULL; \
	thrinfo_t* restrict thread_pb = NULL; \
	thrinfo_t* restrict thread_ic = NULL; \
	thrinfo_t* restrict thread_pa = NULL; \
	thrinfo_t* restrict thread_jr = NULL; \
	thrinfo_t* restrict thread_ir = NULL; \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \
\
	/* Identify the current thrinfo_t node and then grow the tree. */ \
	thread_jc = thread; \
	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
	/* Compute the JC loop thread range for the current thread. */ \
	dim_t jc_start, jc_end; \
	bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
	const dim_t n_local = jc_end - jc_start; \
\
	/* Compute number of primary and leftover components of the JC loop. */ \
	/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
	const dim_t jc_left =   n_local % NC; \
\
	/* Loop over the n dimension (NC rows/columns at a time). */ \
	for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
	{ \
		/* Calculate the thread's current JC block dimension. */ \
		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
\
		ctype* restrict b_jc = b_00 + jj * jcstep_b; \
		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
		/* Identify the current thrinfo_t node and then grow the tree. */ \
		thread_pc = bli_thrinfo_sub_node( thread_jc ); \
		bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
\
		/* Compute the PC loop thread range for the current thread. Note
		   that the PC loop is not parallelized: every thread traverses
		   the entire k dimension. */ \
		const dim_t pc_start = 0, pc_end = k; \
		const dim_t k_local = k; \
\
		/* Compute number of primary and leftover components of the PC loop. */ \
		/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
		const dim_t pc_left =   k_local % KC; \
\
		/* Loop over the k dimension (KC rows/columns at a time). */ \
		for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
		{ \
			/* Calculate the thread's current PC block dimension. */ \
			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
\
			ctype* restrict a_pc = a_00 + pp * pcstep_a; \
			ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
			/* Only apply beta to the first iteration of the pc loop. */ \
			ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
			ctype* b_use; \
			inc_t  rs_b_use, cs_b_use, ps_b_use; \
\
			/* Identify the current thrinfo_t node. Note that the thrinfo_t
			   node will have already been created by a previous call to
			   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
			   cause the tree to grow by two (e.g. to the next bszid that is
			   a normal bszid_t value). */ \
			thread_pb = bli_thrinfo_sub_node( thread_pc ); \
			/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
\
			/* Determine the packing buffer and related parameters for matrix
			   B. Then call the packm implementation. */ \
			PASTECH2(bls_,ch,packm_b) \
			( \
			  conjb, \
			  KC,     NC, \
			  kc_cur, nc_cur, NR, \
			  &one_local, \
			  b_pc,   rs_b,      cs_b, \
			  &b_use, &rs_b_use, &cs_b_use, \
			                     &ps_b_use, \
			  cntx, \
			  rntm, \
			  &mem_b, \
			  thread_pb  \
			); \
\
			/* Alias b_use so that it's clear this is our current block of
			   matrix B. */ \
			ctype* restrict b_pc_use = b_use; \
\
			/* Identify the current thrinfo_t node and then grow the tree. */ \
			thread_ic = bli_thrinfo_sub_node( thread_pb ); \
			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
			/* Compute the IC loop thread range for the current thread. */ \
			dim_t ic_start, ic_end; \
			bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
			const dim_t m_local = ic_end - ic_start; \
\
			/* Compute number of primary and leftover components of the IC loop. */ \
			/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
			const dim_t ic_left =   m_local % MC; \
\
			/* Loop over the m dimension (MC rows at a time). */ \
			for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
			{ \
				/* Calculate the thread's current IC block dimension. */ \
				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
\
				ctype* restrict a_ic = a_pc + ii * icstep_a; \
				ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
				ctype* a_use; \
				inc_t  rs_a_use, cs_a_use, ps_a_use; \
\
				/* Identify the current thrinfo_t node. Note that the thrinfo_t
				   node will have already been created by a previous call to
				   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
				   cause the tree to grow by two (e.g. to the next bszid that is
				   a normal bszid_t value). */ \
				thread_pa = bli_thrinfo_sub_node( thread_ic ); \
				/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
\
				/* Determine the packing buffer and related parameters for matrix
				   A. Then call the packm implementation. */ \
				PASTECH2(bls_,ch,packm_a) \
				( \
				  conja, \
				  MC,     KC, \
				  mc_cur, kc_cur, MR, \
				  &one_local, \
				  a_ic,   rs_a,      cs_a, \
				  &a_use, &rs_a_use, &cs_a_use, \
				                     &ps_a_use, \
				  cntx, \
				  rntm, \
				  &mem_a, \
				  thread_pa  \
				); \
\
				/* Alias a_use so that it's clear this is our current block of
				   matrix A. */ \
				ctype* restrict a_ic_use = a_use; \
\
				/* Identify the current thrinfo_t node and then grow the tree. */ \
				thread_jr = bli_thrinfo_sub_node( thread_pa ); \
				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
				/* Query the number of threads and thread ids for the JR loop.
				   NOTE: These values are only needed when computing the next
				   micropanel of B. */ \
				const dim_t jr_nt  = bli_thread_n_way( thread_jr ); \
				const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
\
				/* Compute number of primary and leftover components of the JR loop. */ \
				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
				dim_t jr_left =   nc_cur % NR; \
\
				/* Compute the JR loop thread range for the current thread. */ \
				dim_t jr_start, jr_end; \
				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
				/* Loop over the n dimension (NR columns at a time). */ \
				for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
				{ \
					const dim_t nr_cur \
					= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
					ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
					ctype* restrict c_jr = c_ic     + j * jrstep_c; \
\
					/* Assume for now that our next panel of B to be the current panel
					   of B. */ \
					ctype* restrict b2 = b_jr; \
\
					/* Identify the current thrinfo_t node. */ \
					thread_ir = bli_thrinfo_sub_node( thread_jr ); \
\
					/* Query the number of threads and thread ids for the IR loop.
					   NOTE: These values are only needed when computing the next
					   micropanel of A. */ \
					const dim_t ir_nt  = bli_thread_n_way( thread_ir ); \
					const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
\
					/* Compute number of primary and leftover components of the IR loop. */ \
					dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
					dim_t ir_left =   mc_cur % MR; \
\
					/* Compute the IR loop thread range for the current thread. */ \
					dim_t ir_start, ir_end; \
					bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
\
					/* Loop over the m dimension (MR rows at a time). */ \
					for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
					{ \
						const dim_t mr_cur \
						= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
						ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
						ctype* restrict c_ir = c_jr     + i * irstep_c; \
\
						ctype* restrict a2; \
\
						/* Compute the addresses of the next micropanels of A and B. */ \
						a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
						if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
						{ \
							a2 = a_ic_use; \
							b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
							if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
								b2 = b_pc_use; \
						} \
\
						/* Save the addresses of next micropanels of A and B to the
						   auxinfo_t object. */ \
						bli_auxinfo_set_next_a( a2, &aux ); \
						bli_auxinfo_set_next_b( b2, &aux ); \
\
						/* Handle interior and edge cases separately. */ \
						if ( mr_cur == MR && nr_cur == NR ) \
						{ \
							/* Invoke the gemm microkernel. */ \
							gemm_ukr \
							( \
							  kc_cur, \
							  &alpha_local, \
							  a_ir, \
							  b_jr, \
							  beta_use, \
							  c_ir, rs_c, cs_c, \
							  &aux, \
							  cntx  \
							); \
						} \
						else \
						{ \
							/* Invoke the gemm microkernel on the temporary
							   microtile, then accumulate only the valid
							   mr_cur x nr_cur region into C below. */ \
							gemm_ukr \
							( \
							  kc_cur, \
							  &alpha_local, \
							  a_ir, \
							  b_jr, \
							  &zero_local, \
							  ct, rs_ct, cs_ct, \
							  &aux, \
							  cntx  \
							); \
\
							/* Scale the bottom edge of C and add the result from above. */ \
							PASTEMAC(ch,xpbys_mxn) \
							( \
							  mr_cur, \
							  nr_cur, \
							  ct,   rs_ct, cs_ct, \
							  beta_use, \
							  c_ir, rs_c,  cs_c \
							); \
						} \
					} \
				} \
			} \
\
			/* This barrier is needed to prevent threads from starting to pack
			   the next row panel of B before the current row panel is fully
			   computed upon. */ \
			bli_thread_barrier( thread_pb ); \
		} \
	} \
\
	/* Release any memory that was acquired for packing matrices A and B. */ \
	PASTECH2(bls_,ch,packm_finalize_mem_a) \
	( \
	  rntm, \
	  &mem_a, \
	  thread_pa  \
	); \
	PASTECH2(bls_,ch,packm_finalize_mem_b) \
	( \
	  rntm, \
	  &mem_b, \
	  thread_pb  \
	); \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: c        ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
*/ \
}

// Instantiate the macro above for each of the four standard datatypes.
//INSERT_GENTFUNC_BASIC0( gemm_bp_var1 )
GENTFUNC( float,    s, gemm_bp_var1 )
GENTFUNC( double,   d, gemm_bp_var1 )
GENTFUNC( scomplex, c, gemm_bp_var1 )
GENTFUNC( dcomplex, z, gemm_bp_var1 )

View File

@@ -0,0 +1,596 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"

// Use the name gemm_fp for the function pointer type defined below.
#define FUNCPTR_T gemm_fp

// The signature shared by the typed (per-datatype) variant functions defined
// later in this file; the object-based wrapper selects one of them at
// runtime based on the datatype of C.
typedef void (*FUNCPTR_T)
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m,
       dim_t               n,
       dim_t               k,
       void*      restrict alpha,
       void*      restrict a, inc_t rs_a, inc_t cs_a,
       void*      restrict b, inc_t rs_b, inc_t cs_b,
       void*      restrict beta,
       void*      restrict c, inc_t rs_c, inc_t cs_c,
       cntx_t*    restrict cntx,
       rntm_t*    restrict rntm,
       thrinfo_t* restrict thread
     );
//
// -- gemm-like block-panel algorithm (object interface) -----------------------
//
// Define a function pointer array named ftypes whose elements hold the
// addresses of the typed functions defined below, bls_?gemm_bp_var2().
static FUNCPTR_T GENARRAY_PREF(ftypes,bls_,gemm_bp_var2);

void bls_gemm_bp_var2
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	// Gather the operand properties needed by the typed variant: the
	// datatype of C, the conjugation statuses of A and B, the problem
	// dimensions, and each operand's buffer address and strides.
	const num_t  dt    = bli_obj_dt( c );

	const conj_t conja = bli_obj_conj_status( a );
	const conj_t conjb = bli_obj_conj_status( b );

	const dim_t  m     = bli_obj_length( c );
	const dim_t  n     = bli_obj_width( c );
	const dim_t  k     = bli_obj_width( a );

	void* restrict buf_a     = bli_obj_buffer_at_off( a );
	void* restrict buf_b     = bli_obj_buffer_at_off( b );
	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );

	const inc_t rs_a = bli_obj_row_stride( a );
	const inc_t cs_a = bli_obj_col_stride( a );
	const inc_t rs_b = bli_obj_row_stride( b );
	const inc_t cs_b = bli_obj_col_stride( b );
	const inc_t rs_c = bli_obj_row_stride( c );
	const inc_t cs_c = bli_obj_col_stride( c );

	// Dispatch directly through the function pointer array, selecting the
	// typed variant that corresponds to the datatype of C.
	ftypes[ dt ]
	(
	  conja, conjb,
	  m, n, k,
	  buf_alpha,
	  buf_a, rs_a, cs_a,
	  buf_b, rs_b, cs_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
//
// -- gemm-like block-panel algorithm (typed interface) ------------------------
//
// The GENTFUNC macro below generates the typed variant bls_?gemm_bp_var2()
// for each datatype (s, d, c, z; see the instantiations after the macro).
// The variant implements a five-loop blocked matrix multiplication: the
// JC (NC), PC (KC), and IC (MC) cache-blocking loops -- with B packed to
// row panels inside the PC loop and A packed inside the IC loop -- followed
// by the JR (NR) and IR (MR) register-blocking loops around the microkernel
// wrapper bls_?gemm_kernel(). NOTE: the PC loop is not parallelized here;
// every thread uses pc_start = 0, pc_end = k.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for various blocksizes. */ \
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* Query the context for the microkernel address and cast it to its
function pointer type. */ \
/*
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
*/ \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
/*
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
*/ \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = cs_c; \
const inc_t jcstep_b = cs_b; \
\
const inc_t pcstep_a = cs_a; \
const inc_t pcstep_b = rs_b; \
\
const inc_t icstep_c = rs_c; \
const inc_t icstep_a = rs_a; \
\
const inc_t jrstep_c = cs_c * NR; \
\
const inc_t irstep_c = rs_c * MR; \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
/* Make local copies of the scalars to prevent any unnecessary sharing of
cache lines between the cores' caches. */ \
ctype alpha_local = *alpha_cast; \
ctype beta_local = *beta_cast; \
ctype one_local = *PASTEMAC(ch,1); \
/*ctype zero_local = *PASTEMAC(ch,0);*/ \
\
auxinfo_t aux; \
\
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
needed for the matrix we will be packing (if any), but we do it
unconditionally to be safe. */ \
mem_t mem_a = BLIS_MEM_INITIALIZER; \
mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
/* Define an array of bszid_t ids, which will act as our substitute for
the cntl_t tree. */ \
bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \
BLIS_KC, /* 4th loop */ \
BLIS_NO_PART, /* pack B */ \
BLIS_MC, /* 3rd loop */ \
BLIS_NO_PART, /* pack A */ \
BLIS_NR, /* 2nd loop */ \
BLIS_MR, /* 1st loop */ \
BLIS_KR }; /* microkernel loop */ \
\
bszid_t* restrict bszids_jc = &bszids[0]; \
bszid_t* restrict bszids_pc = &bszids[1]; \
/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
bszid_t* restrict bszids_ic = &bszids[3]; \
/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
bszid_t* restrict bszids_jr = &bszids[5]; \
/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
thrinfo_t* restrict thread_pb = NULL; \
thrinfo_t* restrict thread_ic = NULL; \
thrinfo_t* restrict thread_pa = NULL; \
thrinfo_t* restrict thread_jr = NULL; \
thrinfo_t* restrict thread_ir = NULL; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_jc = thread; \
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
/* Compute the JC loop thread range for the current thread. */ \
dim_t jc_start, jc_end; \
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
const dim_t n_local = jc_end - jc_start; \
\
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = n_local % NC; \
\
/* Loop over the n dimension (NC rows/columns at a time). */ \
for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
{ \
/* Calculate the thread's current JC block dimension. */ \
const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
\
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_pc = bli_thrinfo_sub_node( thread_jc ); \
bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
\
/* Compute the PC loop thread range for the current thread. */ \
const dim_t pc_start = 0, pc_end = k; \
const dim_t k_local = k; \
\
/* Compute number of primary and leftover components of the PC loop. */ \
/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
const dim_t pc_left = k_local % KC; \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
{ \
/* Calculate the thread's current PC block dimension. */ \
const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
\
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop. */ \
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
ctype* b_use; \
inc_t rs_b_use, cs_b_use, ps_b_use; \
\
/* Identify the current thrinfo_t node. Note that the thrinfo_t
node will have already been created by a previous call to
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
cause the tree to grow by two (e.g. to the next bszid that is
a normal bszid_t value). */ \
thread_pb = bli_thrinfo_sub_node( thread_pc ); \
/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
\
/* Determine the packing buffer and related parameters for matrix
B. Then call the packm implementation. */ \
PASTECH2(bls_,ch,packm_b) \
( \
conjb, \
KC, NC, \
kc_cur, nc_cur, NR, \
&one_local, \
b_pc, rs_b, cs_b, \
&b_use, &rs_b_use, &cs_b_use, \
&ps_b_use, \
cntx, \
rntm, \
&mem_b, \
thread_pb \
); \
\
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_pc_use = b_use; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
/* Compute the IC loop thread range for the current thread. */ \
dim_t ic_start, ic_end; \
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
const dim_t m_local = ic_end - ic_start; \
\
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = m_local % MC; \
\
/* Loop over the m dimension (MC rows at a time). */ \
for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
{ \
/* Calculate the thread's current IC block dimension. */ \
const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
\
ctype* restrict a_ic = a_pc + ii * icstep_a; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
ctype* a_use; \
inc_t rs_a_use, cs_a_use, ps_a_use; \
\
/* Identify the current thrinfo_t node. Note that the thrinfo_t
node will have already been created by a previous call to
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
cause the tree to grow by two (e.g. to the next bszid that is
a normal bszid_t value). */ \
thread_pa = bli_thrinfo_sub_node( thread_ic ); \
/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
\
/* Determine the packing buffer and related parameters for matrix
A. Then call the packm implementation. */ \
PASTECH2(bls_,ch,packm_a) \
( \
conja, \
MC, KC, \
mc_cur, kc_cur, MR, \
&one_local, \
a_ic, rs_a, cs_a, \
&a_use, &rs_a_use, &cs_a_use, \
&ps_a_use, \
cntx, \
rntm, \
&mem_a, \
thread_pa \
); \
\
/* Alias a_use so that it's clear this is our current block of
matrix A. */ \
ctype* restrict a_ic_use = a_use; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
/* Query the number of threads and thread ids for the JR loop.
NOTE: These values are only needed when computing the next
micropanel of B. */ \
const dim_t jr_nt = bli_thread_n_way( thread_jr ); \
const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
\
/* Compute number of primary and leftover components of the JR loop. */ \
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
dim_t jr_left = nc_cur % NR; \
\
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
{ \
const dim_t nr_cur \
= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* Assume for now that our next panel of B to be the current panel
of B. */ \
ctype* restrict b2 = b_jr; \
\
/* Identify the current thrinfo_t node. */ \
thread_ir = bli_thrinfo_sub_node( thread_jr ); \
\
/* Query the number of threads and thread ids for the IR loop.
NOTE: These values are only needed when computing the next
micropanel of A. */ \
const dim_t ir_nt = bli_thread_n_way( thread_ir ); \
const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
\
/* Compute number of primary and leftover components of the IR loop. */ \
dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
dim_t ir_left = mc_cur % MR; \
\
/* Compute the IR loop thread range for the current thread. */ \
dim_t ir_start, ir_end; \
bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
{ \
const dim_t mr_cur \
= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
ctype* restrict c_ir = c_jr + i * irstep_c; \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next micropanels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_ic_use; \
b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_pc_use; \
} \
\
/* Save the addresses of next micropanels of A and B to the
auxinfo_t object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Call a wrapper to the kernel (which handles edge cases). */ \
PASTECH2(bls_,ch,gemm_kernel) \
( \
MR, \
NR, \
mr_cur, \
nr_cur, \
kc_cur, \
&alpha_local, \
a_ir, rs_a_use, cs_a_use, \
b_jr, rs_b_use, cs_b_use, \
beta_use, \
c_ir, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
\
/* This barrier is needed to prevent threads from starting to pack
the next row panel of B before the current row panel is fully
computed upon. */ \
bli_thread_barrier( thread_pb ); \
} \
} \
\
/* Release any memory that was acquired for packing matrices A and B. */ \
PASTECH2(bls_,ch,packm_finalize_mem_a) \
( \
rntm, \
&mem_a, \
thread_pa \
); \
PASTECH2(bls_,ch,packm_finalize_mem_b) \
( \
rntm, \
&mem_b, \
thread_pb \
); \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
*/ \
}
//INSERT_GENTFUNC_BASIC0( gemm_bp_var2 )
GENTFUNC( float, s, gemm_bp_var2 )
GENTFUNC( double, d, gemm_bp_var2 )
GENTFUNC( scomplex, c, gemm_bp_var2 )
GENTFUNC( dcomplex, z, gemm_bp_var2 )
//
// -- gemm-like microkernel wrapper --------------------------------------------
//
// The GENTFUNC macro below generates bls_?gemm_kernel(), a wrapper around
// the context's native gemm microkernel that handles edge cases: full
// MR x NR microtiles are computed directly into C, while partial (edge)
// microtiles are computed into a temporary buffer and then accumulated
// into the mr_cur x nr_cur corner of C.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
const dim_t MR, \
const dim_t NR, \
dim_t mr_cur, \
dim_t nr_cur, \
dim_t kc_cur, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict aux, \
cntx_t* restrict cntx \
) \
{ \
/* Infer the datatype from the ctype. */ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for the microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
/* Handle interior and edge cases separately. */ \
if ( mr_cur == MR && nr_cur == NR ) \
{ \
/* Interior case: a full MR x NR microtile. Invoke the gemm
microkernel directly on C, applying beta as usual. */ \
gemm_ukr \
( \
kc_cur, \
alpha, \
a, \
b, \
beta, \
c, rs_c, cs_c, \
aux, \
cntx \
); \
} \
else \
{ \
ctype zero = *PASTEMAC(ch,0); \
\
/* Clear the temporary C buffer in case it has any infs or NaNs.
NOTE: This clear is performed only here, in the edge-case branch
(the only place ct is read), so that interior microtiles do not
pay for it; previously it executed unconditionally. Ideally this
initialization would be done statically, since this wrapper
executes many times and the overhead of touching the temporary
microtile adds up. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \
\
/* Edge case: compute the full microtile into ct with beta = 0. */ \
gemm_ukr \
( \
kc_cur, \
alpha, \
a, \
b, \
&zero, \
ct, rs_ct, cs_ct, \
aux, \
cntx \
); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn) \
( \
mr_cur, \
nr_cur, \
ct, rs_ct, cs_ct, \
beta, \
c, rs_c, cs_c \
); \
} \
}
//INSERT_GENTFUNC_BASIC0( gemm_kernel )
GENTFUNC( float, s, gemm_kernel )
GENTFUNC( double, d, gemm_kernel )
GENTFUNC( scomplex, c, gemm_kernel )
GENTFUNC( dcomplex, z, gemm_kernel )

View File

@@ -0,0 +1,124 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype the object-based variant interfaces.
//
// GENPROT emits one object-based (obj_t-oriented) prototype per operation
// name; the signatures mirror the definitions of bls_gemm_bp_var?().
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTECH(bls_,opname) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
);
GENPROT( gemm_bp_var1 )
GENPROT( gemm_bp_var2 )
//
// Prototype the typed variant interfaces.
//
// GENTPROT emits one type-erased (void*-buffer) prototype per datatype
// character (s, d, c, z) per variant name.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
);
//INSERT_GENTPROT_BASIC0( gemm_bp_var1 )
GENTPROT( float, s, gemm_bp_var1 )
GENTPROT( double, d, gemm_bp_var1 )
GENTPROT( scomplex, c, gemm_bp_var1 )
GENTPROT( dcomplex, z, gemm_bp_var1 )
//INSERT_GENTPROT_BASIC0( gemm_bp_var2 )
GENTPROT( float, s, gemm_bp_var2 )
GENTPROT( double, d, gemm_bp_var2 )
GENTPROT( scomplex, c, gemm_bp_var2 )
GENTPROT( dcomplex, z, gemm_bp_var2 )
//
// Prototype the typed kernel interfaces.
//
// These match the microkernel wrapper bls_?gemm_kernel(), which handles
// both interior (full MR x NR) and edge-case microtiles.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
const dim_t MR, \
const dim_t NR, \
dim_t mr_cur, \
dim_t nr_cur, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict aux, \
cntx_t* restrict cntx \
);
//INSERT_GENTPROT_BASIC0( gemm_kernel )
GENTPROT( float, s, gemm_kernel )
GENTPROT( double, d, gemm_kernel )
GENTPROT( scomplex, c, gemm_kernel )
GENTPROT( dcomplex, z, gemm_kernel )

View File

@@ -0,0 +1,328 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// packm_init_mem_a: ensure that the given mem_t refers to a pool block
// large enough to hold an m x k packed block of A (with m rounded up to a
// multiple of mr). The chief thread acquires (or re-acquires) the block
// from the memory broker; all other threads receive a copy of the chief's
// mem_t via broadcast.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
dim_t m, \
dim_t k, \
dim_t mr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
/* Set the pack buffer type so that we are obtaining memory blocks from
the pool dedicated to blocks of A. */ \
const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; \
\
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
we NEED that last micropanel to have the same ldim (cs_p) as the other
micropanels. Why? Because the microkernel assumes that the register (MR,
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
const dim_t k_pack = k; \
\
/* Barrier to make sure all threads are caught up and ready to begin the
packm stage. */ \
bli_thread_barrier( thread ); \
\
/* Compute the size of the memory block needed. */ \
siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
\
/* Check the mem_t entry provided by the caller. If it is unallocated,
then we need to acquire a block from the memory broker. */ \
if ( bli_mem_is_unalloc( mem ) ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* Acquire directly to the chief thread's mem_t that was passed in.
It needs to be that mem_t struct, and not a local (temporary)
mem_t, since there is no barrier until after packing is finished,
which could allow a race condition whereby the chief thread exits
the current function before the other threads have a chance to
copy from it. (A barrier would fix that race condition, but then
again, I prefer to keep barriers to a minimum.) */ \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t to all
threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else /* if ( bli_mem_is_alloc( mem ) ) */ \
{ \
/* If the mem_t entry provided by the caller does NOT contain a NULL
buffer, then a block has already been acquired from the memory
broker and cached by the caller. */ \
\
/* As a sanity check, we should make sure that the mem_t object isn't
associated with a block that is too small compared to the size of
the packed matrix buffer that is needed, according to the value
computed above. */ \
siz_t mem_size = bli_mem_size( mem ); \
\
if ( mem_size < size_needed ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* The chief thread releases the existing block associated
with the mem_t, and then re-acquires a new block, saving
the associated mem_t to its passed-in mem_t. (See comment
above for why the acquisition needs to be directly to
the chief thread's passed-in mem_t and not a local
(temporary) mem_t.) */ \
bli_membrk_release \
( \
rntm, \
mem \
); \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else \
{ \
/* If the mem_t entry is already allocated and sufficiently large,
then we use it as-is. No action is needed. */ \
} \
} \
}
//INSERT_GENTFUNC_BASIC0( packm_init_mem_a )
GENTFUNC( float, s, packm_init_mem_a )
GENTFUNC( double, d, packm_init_mem_a )
GENTFUNC( scomplex, c, packm_init_mem_a )
GENTFUNC( dcomplex, z, packm_init_mem_a )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
	/* Only the chief thread of the outermost group releases the pack
	   buffer; non-chief threads (and a NULL thread) do nothing. The
	   short-circuit guards the ochief query against a NULL thread. */ \
	if ( thread != NULL && bli_thread_am_ochief( thread ) ) \
	{ \
		/* Release the block back to the memory broker, but only if the
		   mem_t is actually allocated (it should be). */ \
		if ( bli_mem_is_alloc( mem ) ) \
		{ \
			bli_membrk_release( rntm, mem ); \
		} \
	} \
}
//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a )
GENTFUNC( float, s, packm_finalize_mem_a )
GENTFUNC( double, d, packm_finalize_mem_a )
GENTFUNC( scomplex, c, packm_finalize_mem_a )
GENTFUNC( dcomplex, z, packm_finalize_mem_a )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
pack_t* restrict schema, \
dim_t m, \
dim_t k, \
dim_t mr, \
dim_t* restrict m_max, \
dim_t* restrict k_max, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
mem_t* restrict mem \
) \
{ \
	/* Round m up to a whole number of mr-sized micropanels. This padding
	   of the final (partial) micropanel is required so that it has the
	   same leading dimension (cs_p) as the others, since the microkernel
	   assumes the register (MR, NR) and storage (PACKMR, PACKNR)
	   blocksizes never change. */ \
	*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
	*k_max = k; \
\
	/* A is packed to column-stored row micropanels: unit row stride,
	   column stride (and panel dimension) equal to mr, with each
	   micropanel occupying mr * k elements. */ \
	*rs_p = 1; \
	*cs_p = mr; \
	*pd_p = mr; \
	*ps_p = mr * k; \
\
	/* Tag the packed matrix as conventional column-stored row panels. */ \
	*schema = BLIS_PACKED_ROW_PANELS; \
\
	/* Hand back the address of the buffer underlying the mem_t entry
	   that was acquired from the memory pool. */ \
	*p = bli_mem_buffer( mem ); \
}
//INSERT_GENTFUNC_BASIC0( packm_init_a )
GENTFUNC( float, s, packm_init_a )
GENTFUNC( double, d, packm_init_a )
GENTFUNC( scomplex, c, packm_init_a )
GENTFUNC( dcomplex, z, packm_init_a )
//
// Define BLAS-like interfaces to the variant chooser.
//
// packm_a: top-level driver that (1) prepares (or reuses) a pack buffer
// sized for an m_alloc x k_alloc block, (2) initializes the packed-matrix
// parameters for the current m x k block of A, (3) packs A into MR x k
// column-stored micropanels via packm_var1 (defined elsewhere), and
// (4) barriers so computation begins only after packing completes.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
conj_t conj, \
dim_t m_alloc, \
dim_t k_alloc, \
dim_t m, \
dim_t k, \
dim_t mr, \
ctype* restrict kappa, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
pack_t schema; \
dim_t m_max; \
dim_t k_max; \
dim_t pd_p; \
\
/* Prepare the packing destination buffer. */ \
PASTECH2(bls_,ch,packm_init_mem_a) \
( \
m_alloc, k_alloc, mr, \
cntx, \
rntm, \
mem, \
thread \
); \
\
/* Determine the packing buffer and related parameters for matrix A. */ \
PASTECH2(bls_,ch,packm_init_a) \
( \
&schema, \
m, k, mr, \
&m_max, &k_max, \
p, rs_p, cs_p, \
&pd_p, ps_p, \
mem \
); \
\
/* Pack matrix A to the destination buffer chosen above. Here, the packed
matrix is stored to column-stored MR x k micropanels. */ \
PASTECH2(bls_,ch,packm_var1) \
( \
conj, \
schema, \
m, \
k, \
m_max, \
k_max, \
kappa, \
a, rs_a, cs_a, \
*p, *rs_p, *cs_p, \
pd_p, *ps_p, \
cntx, \
thread \
); \
\
/* Barrier so that packing is done before computation. */ \
bli_thread_barrier( thread ); \
}
//INSERT_GENTFUNC_BASIC0( packm_a )
GENTFUNC( float, s, packm_a )
GENTFUNC( double, d, packm_a )
GENTFUNC( scomplex, c, packm_a )
GENTFUNC( dcomplex, z, packm_a )

View File

@@ -0,0 +1,122 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototypes for the packm helper functions for matrix A: pool-buffer
// acquisition (packm_init_mem_a), release (packm_finalize_mem_a), packed
// parameter initialization (packm_init_a), and the packing driver
// (packm_a). These mirror the GENTFUNC definitions in the corresponding
// source file.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
dim_t m, \
dim_t k, \
dim_t mr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_init_mem_a )
GENTPROT( float, s, packm_init_mem_a )
GENTPROT( double, d, packm_init_mem_a )
GENTPROT( scomplex, c, packm_init_mem_a )
GENTPROT( dcomplex, z, packm_init_mem_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a )
GENTPROT( float, s, packm_finalize_mem_a )
GENTPROT( double, d, packm_finalize_mem_a )
GENTPROT( scomplex, c, packm_finalize_mem_a )
GENTPROT( dcomplex, z, packm_finalize_mem_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
pack_t* restrict schema, \
dim_t m, \
dim_t k, \
dim_t mr, \
dim_t* restrict m_max, \
dim_t* restrict k_max, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
mem_t* restrict mem \
); \
//INSERT_GENTPROT_BASIC0( packm_init_a )
GENTPROT( float, s, packm_init_a )
GENTPROT( double, d, packm_init_a )
GENTPROT( scomplex, c, packm_init_a )
GENTPROT( dcomplex, z, packm_init_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
conj_t conj, \
dim_t m_alloc, \
dim_t k_alloc, \
dim_t m, \
dim_t k, \
dim_t mr, \
ctype* restrict kappa, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_a )
GENTPROT( float, s, packm_a )
GENTPROT( double, d, packm_a )
GENTPROT( scomplex, c, packm_a )
GENTPROT( dcomplex, z, packm_a )

View File

@@ -0,0 +1,328 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define bls_?packm_init_mem_b(): prepare the mem_t entry that will hold the
// packed copy of B, acquiring (or, if the cached block is too small,
// re-acquiring) a block from the memory broker as needed.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
dim_t k, \
dim_t n, \
dim_t nr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
/* Set the pack buffer type so that we are obtaining memory blocks from
the pool dedicated to panels of B. */ \
const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; \
\
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
we NEED that last micropanel to have the same ldim (cs_p) as the other
micropanels. Why? Because the microkernel assumes that the register (MR,
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
const dim_t k_pack = k; \
const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
\
/* Barrier to make sure all threads are caught up and ready to begin the
packm stage. */ \
bli_thread_barrier( thread ); \
\
/* Compute the size of the memory block needed. */ \
siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
\
/* Check the mem_t entry provided by the caller. If it is unallocated,
then we need to acquire a block from the memory broker. */ \
if ( bli_mem_is_unalloc( mem ) ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* Acquire directly to the chief thread's mem_t that was passed in.
It needs to be that mem_t struct, and not a local (temporary)
mem_t, since there is no barrier until after packing is finished,
which could allow a race condition whereby the chief thread exits
the current function before the other threads have a chance to
copy from it. (A barrier would fix that race condition, but then
again, I prefer to keep barriers to a minimum.) */ \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t to all
threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else /* if ( bli_mem_is_alloc( mem ) ) */ \
{ \
/* If the mem_t entry provided by the caller does NOT contain a NULL
buffer, then a block has already been acquired from the memory
broker and cached by the caller. */ \
\
/* As a sanity check, we should make sure that the mem_t object isn't
associated with a block that is too small compared to the size of
the packed matrix buffer that is needed, according to the value
computed above. */ \
siz_t mem_size = bli_mem_size( mem ); \
\
if ( mem_size < size_needed ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* The chief thread releases the existing block associated
with the mem_t, and then re-acquires a new block, saving
the associated mem_t to its passed-in mem_t. (See comment
above for why the acquisition needs to be directly to
the chief thread's passed-in mem_t and not a local
(temporary) mem_t.) */ \
bli_membrk_release \
( \
rntm, \
mem \
); \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else \
{ \
/* If the mem_t entry is already allocated and sufficiently large,
then we use it as-is. No action is needed. */ \
} \
} \
}
//INSERT_GENTFUNC_BASIC0( packm_init_mem_b )
GENTFUNC( float, s, packm_init_mem_b )
GENTFUNC( double, d, packm_init_mem_b )
GENTFUNC( scomplex, c, packm_init_mem_b )
GENTFUNC( dcomplex, z, packm_init_mem_b )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
/* bls_?packm_finalize_mem_b(): return the packing buffer for B to the
   memory broker. Only the chief thread of the outer communicator performs
   the release; every other thread (and a NULL thread) does nothing. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread \
     ) \
{ \
	/* Nothing to do unless we were given a thread info node and we are the
	   chief thread of its communicator. */ \
	if ( thread != NULL && bli_thread_am_ochief( thread ) ) \
	{ \
		/* Release the mem_t entry back to the broker, but only if it is
		   currently allocated, which it normally should be. */ \
		if ( bli_mem_is_alloc( mem ) ) \
		{ \
			bli_membrk_release( rntm, mem ); \
		} \
	} \
}
//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b )
GENTFUNC( float, s, packm_finalize_mem_b )
GENTFUNC( double, d, packm_finalize_mem_b )
GENTFUNC( scomplex, c, packm_finalize_mem_b )
GENTFUNC( dcomplex, z, packm_finalize_mem_b )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
/* bls_?packm_init_b(): compute the dimensions, strides, schema, and buffer
   address used when packing matrix B to row-stored column micropanels. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       pack_t* restrict schema, \
       dim_t k, \
       dim_t n, \
       dim_t nr, \
       dim_t* restrict k_max, \
       dim_t* restrict n_max, \
       ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
       dim_t* restrict pd_p, inc_t* restrict ps_p, \
       mem_t* restrict mem \
     ) \
{ \
	/* NOTE: The packed n dimension MUST be rounded up to a whole multiple
	   of NR so that the last micropanel has the same ldim (cs_p) as the
	   other micropanels; the microkernel assumes that the register (MR,
	   NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
	const dim_t n_panels = n / nr + ( n % nr ? 1 : 0 ); \
	\
	*k_max = k; \
	*n_max = n_panels * nr; \
	\
	/* B is packed to row-stored column panels: unit column stride, row
	   stride (and panel dimension) equal to NR, panel stride of k * NR. */ \
	*cs_p = 1; \
	*rs_p = nr; \
	*pd_p = nr; \
	*ps_p = k * nr; \
	\
	/* The "packed column panels" schema indicates packing to conventional
	   row-stored column panels. */ \
	*schema = BLIS_PACKED_COL_PANELS; \
	\
	/* Hand back the address of the memory associated with the mem_t entry
	   that was previously acquired from the memory pool. */ \
	*p = bli_mem_buffer( mem ); \
}
//INSERT_GENTFUNC_BASIC0( packm_init_b )
GENTFUNC( float, s, packm_init_b )
GENTFUNC( double, d, packm_init_b )
GENTFUNC( scomplex, c, packm_init_b )
GENTFUNC( dcomplex, z, packm_init_b )
//
// Define BLAS-like interfaces to the variant chooser.
//
// bls_?packm_b() prepares the packing buffer for matrix B (acquiring a
// block from the memory broker if necessary), initializes the packing
// parameters, packs B into the buffer via packm_var1, and then executes a
// barrier so that packing is complete before computation proceeds.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
conj_t conj, \
dim_t k_alloc, \
dim_t n_alloc, \
dim_t k, \
dim_t n, \
dim_t nr, \
ctype* restrict kappa, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
pack_t schema; \
dim_t k_max; \
dim_t n_max; \
dim_t pd_p; \
\
/* Prepare the packing destination buffer. */ \
PASTECH2(bls_,ch,packm_init_mem_b) \
( \
k_alloc, n_alloc, nr, \
cntx, \
rntm, \
mem, \
thread \
); \
\
/* Determine the packing buffer and related parameters for matrix B. */ \
PASTECH2(bls_,ch,packm_init_b) \
( \
&schema, \
k, n, nr, \
&k_max, &n_max, \
p, rs_p, cs_p, \
&pd_p, ps_p, \
mem \
); \
\
/* Pack matrix B to the destination buffer chosen above. Here, the packed
matrix is stored to row-stored k x NR micropanels. */ \
PASTECH2(bls_,ch,packm_var1) \
( \
conj, \
schema, \
k, \
n, \
k_max, \
n_max, \
kappa, \
b, rs_b, cs_b, \
*p, *rs_p, *cs_p, \
pd_p, *ps_p, \
cntx, \
thread \
); \
\
/* Barrier so that packing is done before computation. */ \
bli_thread_barrier( thread ); \
}
//INSERT_GENTFUNC_BASIC0( packm_b )
GENTFUNC( float, s, packm_b )
GENTFUNC( double, d, packm_b )
GENTFUNC( scomplex, c, packm_b )
GENTFUNC( dcomplex, z, packm_b )

View File

@@ -0,0 +1,122 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype bls_?packm_init_mem_b(): acquire/prepare the mem_t block that
// holds the packed copy of B. One prototype per datatype (s, d, c, z).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
dim_t k, \
dim_t n, \
dim_t nr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_init_mem_b )
GENTPROT( float, s, packm_init_mem_b )
GENTPROT( double, d, packm_init_mem_b )
GENTPROT( scomplex, c, packm_init_mem_b )
GENTPROT( dcomplex, z, packm_init_mem_b )
//
// Prototype bls_?packm_finalize_mem_b(): release the pack buffer for B back
// to the memory broker. One prototype per datatype (s, d, c, z).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b )
GENTPROT( float, s, packm_finalize_mem_b )
GENTPROT( double, d, packm_finalize_mem_b )
GENTPROT( scomplex, c, packm_finalize_mem_b )
GENTPROT( dcomplex, z, packm_finalize_mem_b )
//
// Prototype bls_?packm_init_b(): initialize the packing dimensions, strides,
// schema, and buffer address for B. One prototype per datatype (s, d, c, z).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
pack_t* restrict schema, \
dim_t k, \
dim_t n, \
dim_t nr, \
dim_t* restrict k_max, \
dim_t* restrict n_max, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
mem_t* restrict mem \
); \
//INSERT_GENTPROT_BASIC0( packm_init_b )
GENTPROT( float, s, packm_init_b )
GENTPROT( double, d, packm_init_b )
GENTPROT( scomplex, c, packm_init_b )
GENTPROT( dcomplex, z, packm_init_b )
//
// Prototype bls_?packm_b(): the packm variant chooser for matrix B. One
// prototype per datatype (s, d, c, z).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
conj_t conj, \
dim_t k_alloc, \
dim_t n_alloc, \
dim_t k, \
dim_t n, \
dim_t nr, \
ctype* restrict kappa, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_b )
GENTPROT( float, s, packm_b )
GENTPROT( double, d, packm_b )
GENTPROT( scomplex, c, packm_b )
GENTPROT( dcomplex, z, packm_b )

View File

@@ -0,0 +1,198 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-like interfaces to the variants.
//
// bls_?packm_var1(): pack a matrix to row-stored column panels or
// column-stored row panels (chosen by the schema), with the micropanel
// iterations partitioned among the threads of the current packm
// thrinfo_t node.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
trans_t transc, \
pack_t schema, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
cntx_t* restrict cntx, \
thrinfo_t* restrict thread \
) \
{ \
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic; \
dim_t ic0; \
doff_t ic_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp; \
conj_t conjc; \
\
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
bool row_stored = bli_is_col_packed( schema ); \
/*bool col_stored = bli_is_row_packed( schema );*/ \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
vs_c = cs_c; \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
vs_c = rs_c; \
ldc = cs_c; \
ldp = cs_p; \
} \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
} \
\
ctype* restrict p_begin = p_cast; \
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
( void )nt; \
( void )tid; \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
will depend on whether slab or round-robin partitioning was requested
at configure-time. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( ic = ic0, it = 0; it < n_iter; \
ic += ic_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
ctype* restrict c_begin = c_cast + (ic )*vs_c; \
\
ctype* restrict c_use = c_begin; \
ctype* restrict p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
/* The definition of bli_packm_my_iter() will depend on whether slab
or round-robin partitioning was requested at configure-time. (The
default is slab.) */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
PASTEMAC(ch,packm_cxk) \
( \
conjc, \
schema, \
panel_dim_i, \
panel_dim_max, \
panel_len_i, \
panel_len_max_i, \
kappa_cast, \
c_use, vs_c, ldc, \
p_use, ldp, \
cntx \
); \
} \
\
/*
if ( !row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
else \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
\
p_begin += ps_p; \
} \
}
//INSERT_GENTFUNC_BASIC0( packm_var1 )
GENTFUNC( float, s, packm_var1 )
GENTFUNC( double, d, packm_var1 )
GENTFUNC( scomplex, c, packm_var1 )
GENTFUNC( dcomplex, z, packm_var1 )

View File

@@ -0,0 +1,63 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces to the variants.
//
// One prototype of bls_?packm_var1() is emitted per floating-point
// datatype (s, d, c, z).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
trans_t transc, \
pack_t schema, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
cntx_t* restrict cntx, \
thrinfo_t* restrict thread \
);
//INSERT_GENTPROT_BASIC0( packm_var1 )
GENTPROT( float, s, packm_var1 )
GENTPROT( double, d, packm_var1 )
GENTPROT( scomplex, c, packm_var1 )
GENTPROT( dcomplex, z, packm_var1 )

View File

@@ -0,0 +1,73 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SBX_L3_DECOR_H
#define BLIS_SBX_L3_DECOR_H
// -- sup definitions ----------------------------------------------------------
// Level-3 sup internal function type. This is the signature of the
// operation implementation that the thread decorator executes on each
// thread.
typedef void (*l3sbxint_t)
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Level-3 sup thread decorator prototype. The definition comes from one of
// the headers included below, selected at compile time by the threading
// configuration (single, OpenMP, or pthreads).
void bls_l3_thread_decorator
(
l3sbxint_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);
// Include definitions specific to the method of multithreading.
#include "bls_l3_decor_single.h"
#include "bls_l3_decor_openmp.h"
#include "bls_l3_decor_pthreads.h"
#endif

View File

@@ -0,0 +1,138 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_OPENMP
// Define a dummy thread entry function, which is needed in the pthreads
// version, so that when building Windows DLLs (with OpenMP enabled or with
// no multithreading) we don't risk having an unresolved symbol. The dummy
// ignores its argument and always returns NULL.
void* bls_l3_thread_entry( void* data_void )
{
	// Explicitly discard the unused argument to avoid unused-parameter
	// warnings (matching the `( void )family;` idiom used in the pthreads
	// entry function).
	( void )data_void;

	return NULL;
}
//#define PRINT_THRINFO

// OpenMP-based definition of the level-3 sup thread decorator: spawns
// n_threads OpenMP threads, gives each a thread-local rntm_t and a root
// thrinfo_t node, and invokes func on every thread.
void bls_l3_thread_decorator
     (
       l3sbxint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     )
{
	// Query the total number of threads from the rntm_t object.
	const dim_t n_threads = bli_rntm_num_threads( rntm );

	// The operation family is not used by this implementation; discard it
	// explicitly to avoid unused-parameter warnings. (The pthreads-based
	// thread entry function uses the same idiom.)
	( void )family;

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Access the pool_t* for thread 0 and embed it into the rntm. We do
	// this up-front only so that we have the rntm_t.sba_pool field
	// initialized and ready for the global communicator creation below.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm. This will be
	// inherited by all of the child threads when they make local copies of
	// the rntm below.
	bli_membrk_rntm_set_membrk( rntm );

	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	_Pragma( "omp parallel num_threads(n_threads)" )
	{
		// Create a thread-local copy of the master thread's rntm_t. This is
		// necessary since we want each thread to be able to track its own
		// small block pool_t as it executes down the function stack.
		rntm_t           rntm_l = *rntm;
		rntm_t* restrict rntm_p = &rntm_l;

		// Query the thread's id from OpenMP.
		const dim_t tid = omp_get_thread_num();

		// Check for a somewhat obscure OpenMP thread-mismatch issue.
		// NOTE: This calls the same function used for the conventional/large
		// code path.
		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );

		// Use the thread id to access the appropriate pool_t* within the
		// array_t, and use it to set the sba_pool field within the rntm_t.
		// If the pool_t* element within the array_t is NULL, it will first
		// be allocated/initialized.
		bli_sba_rntm_set_pool( tid, array, rntm_p );

		thrinfo_t* thread = NULL;

		// Create the root node of the thread's thrinfo_t structure.
		bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );

		// Execute the requested operation on this thread.
		func
		(
		  alpha,
		  a,
		  b,
		  beta,
		  c,
		  cntx,
		  rntm_p,
		  thread
		);

		// Free the current thread's thrinfo_t structure.
		bli_l3_sup_thrinfo_free( rntm_p, thread );
	}

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_thrinfo_free()
	// (called from the thread entry function).

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );
}
#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SBX_L3_DECOR_OPENMP_H
#define BLIS_SBX_L3_DECOR_OPENMP_H
// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP
#endif
#endif

View File

@@ -0,0 +1,213 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_PTHREADS
// A data structure to assist in passing operands to additional threads.
// The chief thread fills one instance per thread; each thread's entry
// function then unpacks its instance.
typedef struct thread_data
{
l3sbxint_t func; // the level-3 sup implementation each thread executes
opid_t family; // operation family (currently unused by the entry function)
obj_t* alpha; // operands and scalars of the operation
obj_t* a;
obj_t* b;
obj_t* beta;
obj_t* c;
cntx_t* cntx; // context
rntm_t* rntm; // the chief thread's runtime object (copied locally per thread)
dim_t tid; // this thread's id
thrcomm_t* gl_comm; // global communicator for the root thrinfo_t nodes
array_t* array; // array_t checked out from the small block allocator
} thread_data_t;
// Entry point function for additional threads.
void* bls_l3_thread_entry( void* data_void )
{
// Unpack the operands and metadata prepared for this thread by the chief
// thread in bls_l3_thread_decorator().
thread_data_t* data = data_void;
l3sbxint_t func = data->func;
opid_t family = data->family;
obj_t* alpha = data->alpha;
obj_t* a = data->a;
obj_t* b = data->b;
obj_t* beta = data->beta;
obj_t* c = data->c;
cntx_t* cntx = data->cntx;
rntm_t* rntm = data->rntm;
dim_t tid = data->tid;
array_t* array = data->array;
thrcomm_t* gl_comm = data->gl_comm;
// The operation family is currently unused; discard it explicitly to
// avoid compiler warnings.
( void )family;
// Create a thread-local copy of the master thread's rntm_t. This is
// necessary since we want each thread to be able to track its own
// small block pool_t as it executes down the function stack.
rntm_t rntm_l = *rntm;
rntm_t* restrict rntm_p = &rntm_l;
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
bli_sba_rntm_set_pool( tid, array, rntm_p );
thrinfo_t* thread = NULL;
// Create the root node of the current thread's thrinfo_t structure.
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
// Execute the requested operation on this thread.
func
(
alpha,
a,
b,
beta,
c,
cntx,
rntm_p,
thread
);
// Free the current thread's thrinfo_t structure.
bli_l3_sup_thrinfo_free( rntm_p, thread );
return NULL;
}
// Parallelize execution of a level-3 sandbox operation using POSIX threads.
// Thread 0 is the calling (chief) thread; n_threads-1 additional pthreads
// are spawned, and all of them funnel into bls_l3_thread_entry().
// The interface mirrors the framework's bli_l3_sup_thread_decorator().
void bls_l3_thread_decorator
     (
       l3sbxint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     )
{
	// Query the total number of threads from the rntm_t.
	const dim_t n_threads = bli_rntm_num_threads( rntm );

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Access the pool_t* for thread 0 and embed it into the rntm. We do
	// this up-front only so that we have the rntm_t.sba_pool field
	// initialized and ready for the global communicator creation below.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm. This will be
	// inherited by all of the child threads when they make local copies of
	// the rntm below.
	bli_membrk_rntm_set_membrk( rntm );

	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	// Allocate an array of pthread objects and auxiliary data structs to pass
	// to the thread entry functions.

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bls_l3_thread_decorator().pth: " );
	#endif
	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bls_l3_thread_decorator().pth: " );
	#endif
	thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );

	// NOTE: We must iterate backwards so that the chief thread (thread id 0)
	// can spawn all other threads before proceeding with its own computation.
	for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
	{
		// Set up the thread data for the current thread id (including
		// thread 0, which uses its slot directly below).
		datas[tid].func    = func;
		datas[tid].family  = family;
		datas[tid].alpha   = alpha;
		datas[tid].a       = a;
		datas[tid].b       = b;
		datas[tid].beta    = beta;
		datas[tid].c       = c;
		datas[tid].cntx    = cntx;
		datas[tid].rntm    = rntm;
		datas[tid].tid     = tid;
		datas[tid].gl_comm = gl_comm;
		datas[tid].array   = array;

		// Spawn additional threads for ids greater than 0; the chief thread
		// (id 0) calls the entry function directly.
		if ( tid != 0 )
			bli_pthread_create( &pthreads[tid], NULL, &bls_l3_thread_entry, &datas[tid] );
		else
			bls_l3_thread_entry( ( void* )(&datas[0]) );
	}

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_sup_thrinfo_free()
	// (called from the thread entry function).

	// Thread 0 waits for additional threads to finish.
	for ( dim_t tid = 1; tid < n_threads; tid++ )
	{
		bli_pthread_join( pthreads[tid], NULL );
	}

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bls_l3_thread_decorator().pth: " );
	#endif
	bli_free_intl( pthreads );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bls_l3_thread_decorator().pth: " );
	#endif
	bli_free_intl( datas );
}
#endif

View File

@@ -0,0 +1,47 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H
#define BLIS_SBX_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype. The void* argument is expected to point to
// a thread_data_t populated by bls_l3_thread_decorator().
void* bls_l3_thread_entry( void* data_void );

#endif

#endif

View File

@@ -0,0 +1,141 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifndef BLIS_ENABLE_MULTITHREADING
#define SKIP_THRINFO_TREE
// Single-threaded implementation of the level-3 sandbox thread decorator.
// Compiled only when multithreading is disabled; executes func directly on
// the calling thread with a global single-threaded thrinfo_t object.
void bls_l3_thread_decorator
     (
       l3sbxint_t func,
       opid_t     family,
       //pack_t     schema_a,
       //pack_t     schema_b,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     )
{
	// For sequential execution, we use only one thread.
	const dim_t n_threads = 1;

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Access the pool_t* for thread 0 and embed it into the rntm.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm.
	bli_membrk_rntm_set_membrk( rntm );

#ifndef SKIP_THRINFO_TREE
	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
#endif

	{
		// NOTE: We don't need to create another copy of the rntm_t since
		// it was already copied in one of the high-level oapi functions.
		rntm_t* restrict rntm_p = rntm;

		// There is only one thread id (for the chief thread).
		const dim_t tid = 0;

		// Use the thread id to access the appropriate pool_t* within the
		// array_t, and use it to set the sba_pool field within the rntm_t.
		// If the pool_t* element within the array_t is NULL, it will first
		// be allocated/initialized.
		// NOTE: This is commented out because, in the single-threaded case,
		// this is redundant since it's already been done above.
		//bli_sba_rntm_set_pool( tid, array, rntm_p );

#ifndef SKIP_THRINFO_TREE
		thrinfo_t* thread = NULL;

		// Create the root node of the thread's thrinfo_t structure.
		bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
#else
		// This optimization allows us to use one of the global thrinfo_t
		// objects for single-threaded execution rather than grow one from
		// scratch. The key is that bli_thrinfo_sup_grow(), which is called
		// from within the variants, will immediately return if it detects
		// that the thrinfo_t* passed into it is either
		// &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED.
		thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED;

		( void )tid;
#endif

		// Hand control to the internal back-end implementation.
		func
		(
		  alpha,
		  a,
		  b,
		  beta,
		  c,
		  cntx,
		  rntm_p,
		  thread
		);

#ifndef SKIP_THRINFO_TREE
		// Free the current thread's thrinfo_t structure.
		bli_l3_sup_thrinfo_free( rntm_p, thread );
#endif
	}

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_sup_thrinfo_free()
	// (called above).

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );
}
#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SBX_L3_DECOR_SINGLE_H
#define BLIS_SBX_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (No additional declarations are currently needed by the single-threaded
// implementation of bls_l3_thread_decorator().)
#ifndef BLIS_ENABLE_MULTITHREADING
#endif

#endif

View File

@@ -32,7 +32,14 @@
*/
// This file is needed for the BLIS build system.
// Given the current architecture of BLIS sandboxes, bli_gemmnat() is the
// entry point to any sandbox implementation.
// NOTE: This function is implemented identically to the function that it
// overrides in frame/ind/oapi/bli_l3_nat_oapi.c. This means that we are
// forgoing the option of customizing the implementations that underlie
// bli_gemm() and bli_?gemm(). Any new code defined in this sandbox
// directory, however, will be included in the BLIS library.
#include "blis.h"

View File

@@ -6,7 +6,7 @@ function r_val = plot_panel_4x5 ...
thr_str, ...
dirpath, ...
arch_str, ...
vend_str ...
vend_leg_str ...
)
impl = 'octave';
@@ -25,11 +25,13 @@ else
position = [100 100 1864 1540];
papersize = [15.6 19.4];
%leg_pos_st = [1.15 8.70 2.1 1.2 ]; % (dgemm)
leg_pos_st = [1.60 8.80 2.1 1.2 ]; % (dgemm)
%leg_pos_st = [1.60 8.80 2.1 1.2 ]; % (dgemm)
leg_pos_st = [15.90 13.60 2.1 1.2 ]; % (strsm)
%leg_pos_mt = [12.20 13.60 2.1 1.2 ]; % (strmm)
%leg_pos_mt = [5.30 12.60 2.1 1.2 ]; % (ssymm)
%leg_pos_mt = [8.50 13.62 2.1 1.2 ]; % (ssyrk)
leg_pos_mt = [5.30 5.10 2.1 1.2 ]; % (chemm)
%leg_pos_mt = [5.30 5.10 2.1 1.2 ]; % (chemm)
leg_pos_mt = [15.90 13.60 2.1 1.2 ]; % (strsm)
sp_margins = [ 0.068 0.051 ];
end
@@ -59,7 +61,7 @@ eige_str = 'eigen';
% Create filename "templates" for the files that contain the performance
% results.
filetemp = '%s/output_%s_%s_%s.m'
filetemp = '%s/output_%s_%s_%s.m';
filetemp_blis = sprintf( filetemp, '%s', '%s', '%s', blis_str );
filetemp_open = sprintf( filetemp, '%s', '%s', '%s', open_str );
filetemp_vend = sprintf( filetemp, '%s', '%s', '%s', vend_str );
@@ -102,7 +104,7 @@ for opi = 1:n_opnames
data_blis, ...
data_open, ...
data_eige, ...
data_vend, vend_str, ...
data_vend, vend_leg_str, ...
nth, ...
4, 5, ...
cfreq, ...

View File

@@ -24,7 +24,6 @@ plot_panel_4x5(2.60,16,64, '1s','../results/zen2/20200929/jc4ic4jr4','zen2','MKL
plot_panel_4x5(2.60,16,128,'2s','../results/zen2/20200929/jc8ic4jr4','zen2','MKL'); close all; clear all;
% a64fx
plot_panel_4x5(2.20,32,1, 'st','../results/a64fx/20210316/st', 'a64fx','Fujitsu SSL2'); close all; clear all;
plot_panel_4x5(2.20,32,12,'1s','../results/a64fx/20210316/jc1ic4jr3', 'a64fx','Fujitsu SSL2'); close all; clear all;
plot_panel_4x5(2.20,32,48,'2s','../results/a64fx/20210316/jc1ic4jr12','a64fx','Fujitsu SSL2'); close all; clear all;
plot_panel_4x5(2.20,32,1, 'st','../results/a64fx/20210520/st', 'a64fx','Fujitsu SSL2'); close all; clear all;
plot_panel_4x5(2.20,32,12,'1s','../results/a64fx/20210520/jc1ic1jr12','a64fx','Fujitsu SSL2'); close all; clear all;
plot_panel_4x5(2.20,32,48,'2s','../results/a64fx/20210520/jc1ic4jr12','a64fx','Fujitsu SSL2'); close all; clear all;

View File

@@ -254,18 +254,17 @@ void libblis_test_gemm_experiment
bli_setsc( 0.9, 1.0, &beta );
}
#if 0
//bli_setm( &BLIS_ONE, &a );
bli_setsc( 1.0, 0.0, &alpha );
bli_setsc( 1.0, 0.0, &beta );
#endif
// Randomize A, B, and C, and save C.
libblis_test_mobj_randomize( params, TRUE, &a );
libblis_test_mobj_randomize( params, TRUE, &b );
libblis_test_mobj_randomize( params, TRUE, &c );
bli_copym( &c, &c_save );
//bli_setm( &BLIS_ONE, &a );
//bli_setsc( 1.0, 0.0, &alpha );
//bli_setsc( 0.0, 0.0, &beta );
//bli_setm( &BLIS_ONE, &a );
//bli_setsc( 1.0, 0.0, &alpha );
//bli_setsc( 0.0, 0.0, &beta );
// Apply the parameters.
bli_obj_set_conjtrans( transa, &a );
@@ -456,11 +455,13 @@ bli_printm( "c", c, "%5.2f", "" );
// bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
//bli_printm( "c before", c, "%6.3f", "" );
bli_gemm( alpha, a, b, beta, c );
//bls_gemm( alpha, a, b, beta, c );
#if 0
if ( bli_obj_length( c ) == 12 &&
bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
bli_printm( "c after", c, "%6.3f", "" );
#endif
//bli_printm( "c after", c, "%5.2f", "" );
break;
default: