Added 512b SVE-based a64fx subconfig + SVE kernels.

Details:
- Added 512-bit specific 'a64fx' subconfiguration that uses empirically 
  tuned block size by Stepan Nassyr. This subconfig also sets the sector 
  cache size and enables memory-tagging code in SVE gemm kernels. This 
  subconfig utilizes (16, k) and (10, k) DPACKM kernels.
- Added a vector-length agnostic 'armsve' subconfiguration that computes
  blocksizes according to the analytical model. This part is ported from 
  Stepan Nassyr's repository.
- Implemented vector-length-agnostic [d/s/sh] gemm kernels for Arm SVE 
  at size (2*VL, 10). These kernels use unindexed FMLA instructions 
  because indexed FMLA takes 2 FMA units in many implementations.
  PS: There are indexed-FLMA kernels in Stepan Nassyr's repository.
- Implemented 512-bit SVE dpackm kernels with in-register transpose
  support for sizes (16, k) and (10, k).
- Extended 256-bit SVE dpackm kernels by Linaro Ltd. to 512-bit for 
  size (12, k). This dpackm kernel is not currently used by any 
  subconfiguration.
- Implemented several experimental dgemmsup kernels which would 
  improve performance in a few cases. However, those dgemmsup kernels 
  generally underperform hence they are not currently used in any 
  subconfig.
- Note: This commit squashes several commits submitted by RuQing Xu via
  PR #424.
This commit is contained in:
RuQing Xu
2021-05-19 23:52:29 +09:00
committed by GitHub
parent 5d46dbee4a
commit 61584deddf
34 changed files with 4957 additions and 9 deletions

View File

@@ -0,0 +1,117 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschunszentrum Juelich
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// A64FX: set up cache sizes
//
// Reference: A64FX (TM) specification Fujitsu HPC Extension
// Link: https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_Specification_HPC_Extension_v1_EN.pdf
//
// 63:15 | 14:12 | 11 | 10:08 | 07 | 06:04 | 03 | 02:00 |
// RES0 | l1_sec3_max | RES0 | l1_sec2_max | RES0 | l1_sec1_max | RES0 | l1_sec0_max |
//
// the bits set number of maximum sectors from 0-7
// 000 - 0
// 001 - 1
// 010 - 2
// 011 - 3
// 100 - 4
// 101 - 5
// 110 - 6
// 111 - 7
//
// For L1 we want to maximize the number of sectors for B
// Configuration 1: 1 sector for C (sector 3)
// 1 sector for A (sector 1)
// 6 sectors for B (sector 2)
// 0 sectors for the rest (sector 0)
//
// 16b bitfield conf. 1: 0b0 001 0 110 0 001 0 000
//
// Configuration 2: 1 sector for C (sector 3)
// 1 sector for A (sector 1)
// 5 sectors for B (sector 2)
// 1 sectors for the rest (sector 0)
//
// 16b bitfield conf. 2: 0b0 001 0 101 0 001 0 001
//
// accessing the control register:
//
// MRS <Xt>, S3_3_C11_C8_2
// MSR S3_3_C11_C8_2, <Xt>
//
// TODO: First tests showed no change in performance, a deeper investigation
// is necessary
#define A64FX_SETUP_SECTOR_CACHE_SIZES(config_bitfield)\
{\
uint64_t sector_cache_config = config_bitfield;\
__asm__ volatile(\
"msr s3_3_c11_c8_2,%[sector_cache_config]"\
:\
: [sector_cache_config] "r" (sector_cache_config)\
:\
);\
}
#define A64FX_SETUP_SECTOR_CACHE_SIZES_L2(config_bitfield)\
{\
uint64_t sector_cache_config = config_bitfield;\
__asm__ volatile(\
"msr s3_3_c15_c8_2,%[sector_cache_config]"\
:\
: [sector_cache_config] "r" (sector_cache_config)\
:\
);\
}
#define A64FX_SET_CACHE_SECTOR(areg, tag, sparereg)\
" mov "#sparereg", "#tag" \n\t"\
" lsl "#sparereg", "#sparereg", 56 \n\t"\
" orr "#areg", "#areg", "#sparereg" \n\t"
#define A64FX_READ_SECTOR_CACHE_SIZES(output_uint64)\
__asm__ volatile(\
"mrs %["#output_uint64"],s3_3_c11_c8_2"\
: [output_uint64] "=r" (output_uint64)\
: \
:\
);
#define A64FX_SCC(sec0,sec1,sec2,sec3)\
(uint64_t)((sec0 & 0x7LU) | ((sec1 & 0x7LU) << 4) | ((sec2 & 0x7LU) << 8) | ((sec3 & 0x7LU) << 12))
#define A64FX_SCC_L2(sec02,sec13)\
(uint64_t)((sec02 & 0x1FLU) | ((sec13 & 0x1FLU) << 8))

View File

@@ -0,0 +1,151 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "bli_a64fx_sector_cache.h"
void bli_cntx_init_a64fx( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_a64fx_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
);
// Set SVE-512 packing routine.
bli_cntx_set_packm_kers
(
3,
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
#if 0
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
4,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif
// Set A64FX cache sector sizes for each PE/CMG
// SC Fugaku might disable users' setting cache sizes.
#if !defined(CACHE_SECTOR_SIZE_READONLY)
#pragma omp parallel
{
A64FX_SETUP_SECTOR_CACHE_SIZES(A64FX_SCC(0,1,3,0))
A64FX_SETUP_SECTOR_CACHE_SIZES_L2(A64FX_SCC_L2(9,28))
}
#endif
}

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
#define BLIS_SIMD_ALIGN_SIZE 256
#define BLIS_SIMD_NUM_REGISTERS 32
//#endif

82
config/a64fx/make_defs.mk Normal file
View File

@@ -0,0 +1,82 @@
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
THIS_CONFIG := a64fx
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS := -D_GNU_SOURCE -D_A64FX
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3 -ftree-vectorize -march=armv8-a+sve
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
CKVECFLAGS :=
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif
endif
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -0,0 +1,92 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschunszentrum Juelich
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
dim_t bli_vl_bits_armsve(void)
{ \
uint64_t vl = 0;
__asm__ (
" mov x0, xzr \n\t"
" incb x0 \n\t"
" mov %[vl], x0 \n\t"
: [vl] "=r" (vl)
:
: "x0"
);
return vl;
}
#define EXPANDMAC_BLKSZ_ARMSVE(ch, S_Data) \
void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \
dim_t *k_c_, dim_t *m_c_, dim_t *n_c_) \
{ \
dim_t W_L1 = bli_env_get_var("BLIS_SVE_W_L1", W_L1_SVE_DEFAULT); \
dim_t N_L1 = bli_env_get_var("BLIS_SVE_N_L1", N_L1_SVE_DEFAULT); \
dim_t C_L1 = bli_env_get_var("BLIS_SVE_C_L1", C_L1_SVE_DEFAULT); \
dim_t W_L2 = bli_env_get_var("BLIS_SVE_W_L2", W_L2_SVE_DEFAULT); \
dim_t N_L2 = bli_env_get_var("BLIS_SVE_N_L2", N_L2_SVE_DEFAULT); \
dim_t C_L2 = bli_env_get_var("BLIS_SVE_C_L2", C_L2_SVE_DEFAULT); \
dim_t W_L3 = bli_env_get_var("BLIS_SVE_W_L3", W_L3_SVE_DEFAULT); \
dim_t N_L3 = bli_env_get_var("BLIS_SVE_N_L3", N_L3_SVE_DEFAULT); \
dim_t C_L3 = bli_env_get_var("BLIS_SVE_C_L3", C_L3_SVE_DEFAULT); \
\
dim_t vl_b = bli_vl_bits_armsve(); \
dim_t vl = vl_b / S_Data; \
dim_t m_r = 2 * vl; \
dim_t n_r = 10; \
\
dim_t k_c = (dim_t)( floor((W_L1 - 1.0)/(1.0 + (double)n_r/m_r)) * N_L1 * C_L1 ) \
/ (n_r * S_Data); \
\
dim_t C_Ac = W_L2 - 1 - ceil( (2.0 * k_c * n_r * S_Data)/(C_L2 * N_L2) ); \
dim_t m_c = C_Ac * (N_L2 * C_L2)/(k_c * S_Data); \
m_c -= m_c % m_r; \
\
dim_t C_Bc = W_L3 - 1 - ceil( (2.0 * k_c * m_c * S_Data)/(C_L3 * N_L3) ); \
dim_t n_c = C_Bc * (N_L3 * C_L3)/(k_c * S_Data); \
n_c -= n_c % n_r; \
\
*m_r_ = m_r; \
*n_r_ = n_r; \
*k_c_ = k_c; \
*m_c_ = m_c; \
*n_c_ = n_c; \
}
EXPANDMAC_BLKSZ_ARMSVE( s, 4 )
EXPANDMAC_BLKSZ_ARMSVE( d, 8 )

View File

@@ -0,0 +1,42 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschunszentrum Juelich
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
dim_t bli_vl_bits_armsve(void);
void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);

View File

@@ -0,0 +1,157 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "bli_armsve_config_utils.h"
void bli_cntx_init_armsve( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
#if 0
blksz_t thresh[ BLIS_NUM_THRESH ];
#endif
// Set default kernel blocksizes and functions.
bli_cntx_init_armsve_ref( cntx );
// -------------------------------------------------------------------------
// Block size.
dim_t m_r_s, n_r_s, k_c_s, m_c_s, n_c_s;
dim_t m_r_d, n_r_d, k_c_d, m_c_d, n_c_d;
bli_s_blksz_armsve(&m_r_s, &n_r_s, &k_c_s, &m_c_s, &n_c_s);
bli_d_blksz_armsve(&m_r_d, &n_r_d, &k_c_d, &m_c_d, &n_c_d);
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
2,
// These are vector-length agnostic kernels. Yet knowing mr is required at runtime.
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
);
// Set VL-specific packing routines if applicable.
if (m_r_d==16)
bli_cntx_set_packm_kers
(
3,
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
cntx
);
else if (m_r_d==8)
bli_cntx_set_packm_kers
(
1,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_asm_8xk,
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
#if 0
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
4,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif
}

View File

@@ -0,0 +1,56 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
#define BLIS_SIMD_ALIGN_SIZE 256
#define BLIS_SIMD_NUM_REGISTERS 32
// SVE-specific configs.
#define N_L1_SVE_DEFAULT 64
#define W_L1_SVE_DEFAULT 4
#define C_L1_SVE_DEFAULT 256
#define N_L2_SVE_DEFAULT 2048
#define W_L2_SVE_DEFAULT 16
#define C_L2_SVE_DEFAULT 256
#define N_L3_SVE_DEFAULT 8192
#define W_L3_SVE_DEFAULT 16
#define C_L3_SVE_DEFAULT 256
//#endif

View File

@@ -0,0 +1,82 @@
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
THIS_CONFIG := armsve
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS := -D_GNU_SOURCE
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3 -ftree-vectorize -march=armv8-a+sve
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
CKVECFLAGS :=
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif
endif
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -32,6 +32,8 @@ piledriver: piledriver
bulldozer: bulldozer
# ARM architectures.
armsve: armsve/armsve
a64fx: a64fx/armsve
thunderx2: thunderx2/armv8a
cortexa57: cortexa57/armv8a
cortexa53: cortexa53/armv8a

View File

@@ -173,6 +173,12 @@ void bli_arch_set_id( void )
#endif
// ARM microarchitectures.
#ifdef BLIS_FAMILY_ARMSVE
id = BLIS_ARCH_ARMSVE;
#endif
#ifdef BLIS_FAMILY_A64FX
id = BLIS_ARCH_A64FX;
#endif
#ifdef BLIS_FAMILY_THUNDERX2
id = BLIS_ARCH_THUNDERX2;
#endif
@@ -242,6 +248,8 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
"thunderx2",
"cortexa57",
"cortexa53",
"armsve",
"a64fx",
"cortexa15",
"cortexa9",

View File

@@ -76,7 +76,7 @@ arch_t bli_cpuid_query_id( void )
printf( "vendor = %s\n", vendor==1 ? "AMD": "INTEL" );
printf("family = %x\n", family );
printf( "model = %x\n", model );
printf( "features = %x\n", features );
#endif
@@ -455,6 +455,14 @@ arch_t bli_cpuid_query_id( void )
{
// Check for each ARMv8 configuration that is enabled, check for that
// microarchitecture. We check from most recent to most dated.
#ifdef BLIS_CONFIG_ARMSVE
if ( bli_cpuid_is_armsve( model, part, features ) )
return BLIS_ARCH_ARMSVE;
#endif
#ifdef BLIS_CONFIG_A64FX
if ( bli_cpuid_is_a64fx( model, part, features ) )
return BLIS_ARCH_A64FX;
#endif
#ifdef BLIS_CONFIG_THUNDERX2
if ( bli_cpuid_is_thunderx2( model, part, features ) )
return BLIS_ARCH_THUNDERX2;
@@ -537,6 +545,36 @@ bool bli_cpuid_is_cortexa53
return TRUE;
}
bool bli_cpuid_is_armsve
(
uint32_t family,
uint32_t model,
uint32_t features
)
{
// Check for expected CPU features.
const uint32_t expected = FEATURE_SVE;
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
return TRUE;
}
bool bli_cpuid_is_a64fx
(
uint32_t family,
uint32_t model,
uint32_t features
)
{
// Check for expected CPU features.
const uint32_t expected = FEATURE_SVE;
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
return TRUE;
}
bool bli_cpuid_is_cortexa15
(
uint32_t family,
@@ -1032,6 +1070,10 @@ uint32_t bli_cpuid_query
strstr( feat_str, "asimd" ) != NULL )
*features |= FEATURE_NEON;
// Parse the feature string to check for SVE features.
if ( strstr( feat_str, "sve" ) != NULL )
*features |= FEATURE_SVE;
//printf( "bli_cpuid_query(): features var: %u\n", *features );
// Parse the processor string to uncover the model.

View File

@@ -72,6 +72,8 @@ bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );
@@ -175,7 +177,8 @@ enum
};
enum
{
FEATURE_NEON = 0x1
FEATURE_NEON = 0x01,
FEATURE_SVE = 0x02
};
#endif

View File

@@ -144,6 +144,16 @@ void bli_gks_init( void )
bli_cntx_init_cortexa53_ref,
bli_cntx_init_cortexa53_ind );
#endif
#ifdef BLIS_CONFIG_ARMSVE
bli_gks_register_cntx( BLIS_ARCH_ARMSVE, bli_cntx_init_armsve,
bli_cntx_init_armsve_ref,
bli_cntx_init_armsve_ind );
#endif
#ifdef BLIS_CONFIG_A64FX
bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx,
bli_cntx_init_a64fx_ref,
bli_cntx_init_a64fx_ind );
#endif
#ifdef BLIS_CONFIG_CORTEXA15
bli_gks_register_cntx( BLIS_ARCH_CORTEXA15, bli_cntx_init_cortexa15,
bli_cntx_init_cortexa15_ref,

View File

@@ -83,6 +83,12 @@ CNTX_INIT_PROTS( bulldozer )
// -- ARM architectures --
#ifdef BLIS_CONFIG_ARMSVE
CNTX_INIT_PROTS( armsve )
#endif
#ifdef BLIS_CONFIG_A64FX
CNTX_INIT_PROTS( a64fx )
#endif
#ifdef BLIS_CONFIG_THUNDERX2
CNTX_INIT_PROTS( thunderx2 )
#endif
@@ -183,6 +189,12 @@ CNTX_INIT_PROTS( generic )
// -- ARM architectures --
#ifdef BLIS_FAMILY_ARMSVE
#include "bli_family_armsve.h"
#endif
#ifdef BLIS_FAMILY_A64FX
#include "bli_family_a64fx.h"
#endif
#ifdef BLIS_FAMILY_THUNDERX2
#include "bli_family_thunderx2.h"
#endif

View File

@@ -1005,6 +1005,8 @@ typedef enum
BLIS_ARCH_BULLDOZER,
// ARM
BLIS_ARCH_ARMSVE,
BLIS_ARCH_A64FX,
BLIS_ARCH_THUNDERX2,
BLIS_ARCH_CORTEXA57,
BLIS_ARCH_CORTEXA53,
@@ -1029,7 +1031,7 @@ typedef enum
// NOTE: This value must be updated to reflect the number of enum values
// listed above for arch_t!
#define BLIS_NUM_ARCHS 22
//#define BLIS_NUM_ARCHS 25
//

View File

@@ -0,0 +1,45 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define SVE512_IN_REG_TRANSPOSE_d8x2(DST0,DST1,DST2,DST3,DST4,DST5,DST6SRC0,DST7SRC1,PT,P2C,P4C,P6C) \
"trn1 " #DST0".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
"trn2 " #DST1".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
"compact " #DST2".d, " #P2C", " #DST0".d \n\t" \
"compact " #DST3".d, " #P2C", " #DST1".d \n\t" \
"compact " #DST4".d, " #P4C", " #DST0".d \n\t" \
"compact " #DST5".d, " #P4C", " #DST1".d \n\t" \
"compact " #DST6SRC0".d, " #P6C", " #DST0".d \n\t" \
"compact " #DST7SRC1".d, " #P6C", " #DST1".d \n\t"

View File

@@ -0,0 +1,97 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(XTMP,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
"ptrue " #PT".d \n\t" \
"mov " #XTMP", #2 \n\t" \
"whilelo " #P2C".d, xzr, " #XTMP" \n\t" \
"mov " #XTMP", #4 \n\t" \
"whilelo " #P4".d, xzr, " #XTMP" \n\t" \
"mov " #XTMP", #6 \n\t" \
"whilelo " #P6".d, xzr, " #XTMP" \n\t" \
\
"eor " #PTFTF".b, " #PT"/z, " #P6".b, " #P4".b \n\t" /***** o o | o */ \
"orr " #PTFTF".b, " #PT"/z, " #PTFTF".b, " #P2C".b \n\t" /* | o | o */ \
\
"not " #P2C".b, " #PT"/z, " #P2C".b \n\t" \
"not " #P4C".b, " #PT"/z, " #P4".b \n\t" \
"not " #P6C".b, " #PT"/z, " #P6".b \n\t" \
#define SVE512_IN_REG_TRANSPOSE_d8x8(DST0,DST1,DST2,DST3,DST4,DST5,DST6,DST7,SRC0,SRC1,SRC2,SRC3,SRC4,SRC5,SRC6,SRC7,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
"trn1 " #DST0".d, " #SRC0".d, " #SRC1".d \n\t" \
"trn2 " #DST1".d, " #SRC0".d, " #SRC1".d \n\t" \
"trn1 " #DST2".d, " #SRC2".d, " #SRC3".d \n\t" \
"trn2 " #DST3".d, " #SRC2".d, " #SRC3".d \n\t" \
"trn1 " #DST4".d, " #SRC4".d, " #SRC5".d \n\t" \
"trn2 " #DST5".d, " #SRC4".d, " #SRC5".d \n\t" \
"trn1 " #DST6".d, " #SRC6".d, " #SRC7".d \n\t" \
"trn2 " #DST7".d, " #SRC6".d, " #SRC7".d \n\t" \
\
"compact " #SRC0".d, " #P2C", " #DST0".d \n\t" \
"compact " #SRC2".d, " #P2C", " #DST1".d \n\t" \
"ext " #SRC1".b, " #SRC1".b, " #DST2".b, #48 \n\t" \
"ext " #SRC3".b, " #SRC3".b, " #DST3".b, #48 \n\t" \
"compact " #SRC4".d, " #P2C", " #DST4".d \n\t" \
"compact " #SRC6".d, " #P2C", " #DST5".d \n\t" \
"ext " #SRC5".b, " #SRC5".b, " #DST6".b, #48 \n\t" \
"ext " #SRC7".b, " #SRC7".b, " #DST7".b, #48 \n\t" \
\
"sel " #DST0".d, " #PTFTF", " #DST0".d, " #SRC1".d \n\t" \
"sel " #DST2".d, " #PTFTF", " #SRC0".d, " #DST2".d \n\t" \
"sel " #DST1".d, " #PTFTF", " #DST1".d, " #SRC3".d \n\t" \
"sel " #DST3".d, " #PTFTF", " #SRC2".d, " #DST3".d \n\t" \
"sel " #DST4".d, " #PTFTF", " #DST4".d, " #SRC5".d \n\t" \
"sel " #DST6".d, " #PTFTF", " #SRC4".d, " #DST6".d \n\t" \
"sel " #DST5".d, " #PTFTF", " #DST5".d, " #SRC7".d \n\t" \
"sel " #DST7".d, " #PTFTF", " #SRC6".d, " #DST7".d \n\t" \
\
"compact " #SRC0".d, " #P4C", " #DST0".d \n\t" \
"compact " #SRC1".d, " #P4C", " #DST1".d \n\t" \
"compact " #SRC2".d, " #P4C", " #DST2".d \n\t" \
"compact " #SRC3".d, " #P4C", " #DST3".d \n\t" \
"ext " #SRC4".b, " #SRC4".b, " #DST4".b, #32 \n\t" \
"ext " #SRC5".b, " #SRC5".b, " #DST5".b, #32 \n\t" \
"ext " #SRC6".b, " #SRC6".b, " #DST6".b, #32 \n\t" \
"ext " #SRC7".b, " #SRC7".b, " #DST7".b, #32 \n\t" \
\
"sel " #DST0".d, " #P4", " #DST0".d, " #SRC4".d \n\t" \
"sel " #DST1".d, " #P4", " #DST1".d, " #SRC5".d \n\t" \
"sel " #DST2".d, " #P4", " #DST2".d, " #SRC6".d \n\t" \
"sel " #DST3".d, " #P4", " #DST3".d, " #SRC7".d \n\t" \
"sel " #DST4".d, " #P4", " #SRC0".d, " #DST4".d \n\t" \
"sel " #DST5".d, " #P4", " #SRC1".d, " #DST5".d \n\t" \
"sel " #DST6".d, " #P4", " #SRC2".d, " #DST6".d \n\t" \
"sel " #DST7".d, " #P4", " #SRC3".d, " #DST7".d \n\t"

View File

@@ -52,15 +52,12 @@ void bli_dpackm_armsve256_asm_8xk
dim_t cdim_,
dim_t n_,
dim_t n_max_,
void* restrict kappa_,
void* restrict a_, inc_t inca_, inc_t lda_,
void* restrict p_, inc_t ldp_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
double* a = ( double* )a_;
double* p = ( double* )p_;
double* kappa = ( double* )kappa_;
const int64_t cdim = cdim_;
const int64_t mnr = 8;
const int64_t n = n_;

View File

@@ -0,0 +1,365 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "armsve512_asm_transpose_d8x8.h"
#include "armsve512_asm_transpose_d8x2.h"
// assumption:
// SVE vector length = 512 bits.
void bli_dpackm_armsve512_asm_10xk
(
conj_t conja,
pack_t schema,
dim_t cdim_,
dim_t n_,
dim_t n_max_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
const int64_t cdim = cdim_;
const int64_t mnr = 10;
const int64_t n = n_;
const int64_t n_max = n_max_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
const bool gs = inca != 1 && lda != 1;
const bool unitk = bli_deq1( *kappa );
#ifdef _A64FX
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
{
// A twisted way to infer whether A or B is being packed.
if ( schema == bli_cntx_schema_a_block(cntx) )
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
if ( schema == bli_cntx_schema_b_panel(cntx) )
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
}
#endif
if ( cdim == mnr && !gs && unitk )
{
uint64_t n_mker = n / 8;
uint64_t n_left = n % 8;
__asm__ volatile (
"mov x0, %[a] \n\t"
"mov x1, %[p] \n\t"
"mov x2, %[ldp] \n\t"
"mov x3, %[lda] \n\t"
"mov x4, %[inca] \n\t"
"cmp x4, #1 \n\t"
// Skips by sizeof(double).
"mov x8, #8 \n\t"
"madd x2, x2, x8, xzr \n\t"
"madd x3, x3, x8, xzr \n\t"
"madd x4, x4, x8, xzr \n\t"
// Loop constants.
"mov x8, %[n_mker] \n\t"
"mov x9, %[n_left] \n\t"
"ptrue p0.d \n\t"
"b.ne .AROWSTOR \n\t"
// A stored in columns.
" .ACOLSTOR: \n\t"
// Prefetch distance.
"mov x17, #8 \n\t"
"madd x17, x17, x3, xzr \n\t"
#ifdef _A64FX
// Disable hardware prefetch for A.
"mov x16, 0x6 \n\t"
"lsl x16, x16, #60 \n\t"
"orr x0, x0, x16 \n\t"
#endif
" .ACOLSTORMKER: \n\t"
"cmp x8, xzr \n\t"
"b.eq .ACOLSTORMKEREND \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ldr q1, [x0, #64] \n\t"
"ld1d z2.d, p0/z, [x5] \n\t"
"ldr q3, [x5, #64] \n\t"
"ld1d z4.d, p0/z, [x6] \n\t"
"ldr q5, [x6, #64] \n\t"
"ld1d z6.d, p0/z, [x7] \n\t"
"ldr q7, [x7, #64] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x0, x7, x3 \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"ld1d z8.d, p0/z, [x0] \n\t"
"ldr q9, [x0, #64] \n\t"
"ld1d z10.d, p0/z, [x5] \n\t"
"ldr q11, [x5, #64] \n\t"
"ld1d z12.d, p0/z, [x6] \n\t"
"ldr q13, [x6, #64] \n\t"
"ld1d z14.d, p0/z, [x7] \n\t"
"ldr q15, [x7, #64] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
// Plain storage
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"st1d z0.d, p0, [x1] \n\t"
"str q1, [x1, #64] \n\t"
"st1d z2.d, p0, [x10] \n\t"
"str q3, [x10, #64] \n\t"
"st1d z4.d, p0, [x11] \n\t"
"str q5, [x11, #64] \n\t"
"st1d z6.d, p0, [x12] \n\t"
"str q7, [x12, #64] \n\t"
"st1d z8.d, p0, [x13] \n\t"
"str q9, [x13, #64] \n\t"
"st1d z10.d, p0, [x14] \n\t"
"str q11, [x14, #64] \n\t"
"st1d z12.d, p0, [x15] \n\t"
"str q13, [x15, #64] \n\t"
"st1d z14.d, p0, [x16] \n\t"
"str q15, [x16, #64] \n\t"
"add x1, x16, x2 \n\t"
// Realign and store.
// "ext z1.b, z1.b, z1.b, #16 \n\t"
// "ext z1.b, z1.b, z2.b, #48 \n\t"
// "ext z2.b, z2.b, z3.b, #16 \n\t"
// "ext z2.b, z2.b, z4.b, #32 \n\t"
// "ext z4.b, z4.b, z5.b, #16 \n\t"
// "ext z4.b, z4.b, z6.b, #16 \n\t"
// "ext z6.b, z6.b, z7.b, #16 \n\t"
// "ext z9.b, z9.b, z9.b, #16 \n\t"
// "ext z9.b, z9.b, z10.b, #48 \n\t"
// "ext z10.b, z10.b, z11.b, #16 \n\t"
// "ext z10.b, z10.b, z12.b, #32 \n\t"
// "ext z12.b, z12.b, z13.b, #16 \n\t"
// "ext z12.b, z12.b, z14.b, #16 \n\t"
// "ext z14.b, z14.b, z15.b, #16 \n\t"
// "st1d z0.d, p0, [x1] \n\t"
// "st1d z1.d, p0, [x1, #1, mul vl] \n\t"
// "st1d z2.d, p0, [x1, #2, mul vl] \n\t"
// "st1d z4.d, p0, [x1, #3, mul vl] \n\t"
// "st1d z6.d, p0, [x1, #4, mul vl] \n\t"
// "add x1, x1, #320 \n\t"
// "st1d z8.d, p0, [x1] \n\t"
// "st1d z9.d, p0, [x1, #1, mul vl] \n\t"
// "st1d z10.d, p0, [x1, #2, mul vl] \n\t"
// "st1d z12.d, p0, [x1, #3, mul vl] \n\t"
// "st1d z14.d, p0, [x1, #4, mul vl] \n\t"
// "add x1, x1, #320 \n\t"
"add x0, x7, x3 \n\t"
"sub x8, x8, #1 \n\t"
"b .ACOLSTORMKER \n\t"
" .ACOLSTORMKEREND: \n\t"
" .ACOLSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ldr q1, [x0, #64] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"str q1, [x1, #64] \n\t"
"add x0, x0, x3 \n\t"
"add x1, x1, x2 \n\t"
"sub x9, x9, #1 \n\t"
"b .ACOLSTORLEFT \n\t"
// A stored in rows.
" .AROWSTOR: \n\t"
// Prepare predicates for in-reg transpose.
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
"cmp x8, xzr \n\t"
"b.eq .AROWSTORMKEREND \n\t"
"add x10, x0, x4 \n\t"
"add x11, x10, x4 \n\t"
"add x12, x11, x4 \n\t"
"add x13, x12, x4 \n\t"
"add x14, x13, x4 \n\t"
"add x15, x14, x4 \n\t"
"add x16, x15, x4 \n\t"
"add x17, x16, x4 \n\t"
"add x18, x17, x4 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x10] \n\t"
"ld1d z2.d, p0/z, [x11] \n\t"
"ld1d z3.d, p0/z, [x12] \n\t"
"ld1d z4.d, p0/z, [x13] \n\t"
"ld1d z5.d, p0/z, [x14] \n\t"
"ld1d z6.d, p0/z, [x15] \n\t"
"ld1d z7.d, p0/z, [x16] \n\t"
"ld1d z22.d, p0/z, [x17] \n\t"
"ld1d z23.d, p0/z, [x18] \n\t"
// Transpose first 8 rows.
SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
// Transpose last 2 rows.
SVE512_IN_REG_TRANSPOSE_d8x2(z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3)
// Plain storage.
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"st1d z8.d, p0, [x1] \n\t"
"str q16, [x1, #64] \n\t"
"st1d z9.d, p0, [x10] \n\t"
"str q17, [x10, #64] \n\t"
"st1d z10.d, p0, [x11] \n\t"
"str q18, [x11, #64] \n\t"
"st1d z11.d, p0, [x12] \n\t"
"str q19, [x12, #64] \n\t"
"st1d z12.d, p0, [x13] \n\t"
"str q20, [x13, #64] \n\t"
"st1d z13.d, p0, [x14] \n\t"
"str q21, [x14, #64] \n\t"
"st1d z14.d, p0, [x15] \n\t"
"str q22, [x15, #64] \n\t"
"st1d z15.d, p0, [x16] \n\t"
"str q23, [x16, #64] \n\t"
"add x1, x16, x2 \n\t"
"add x0, x0, #64 \n\t"
"sub x8, x8, #1 \n\t"
"b .AROWSTORMKER \n\t"
" .AROWSTORMKEREND: \n\t"
"mov x4, %[inca] \n\t" // Restore unshifted inca.
"index z30.d, xzr, x4 \n\t" // Generate index.
"lsl x4, x4, #3 \n\t" // Shift again.
"lsl x5, x4, #3 \n\t" // Virtual column vl.
" .AROWSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"add x6, x0, x5 \n\t"
"add x7, x6, x4 \n\t"
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
"ldr d1, [x6] \n\t"
"ldr d2, [x7] \n\t"
"trn1 v1.2d, v1.2d, v2.2d \n\t"
"st1d z0.d, p0, [x1] \n\t"
"str q1, [x1, #64] \n\t"
"add x1, x1, x2 \n\t"
"add x0, x0, #8 \n\t"
"sub x9, x9, #1 \n\t"
"b .AROWSTORLEFT \n\t"
" .UNITKDONE: \n\t"
"mov x0, #0 \n\t"
:
: [a] "r" (a),
[p] "r" (p),
[lda] "r" (lda),
[ldp] "r" (ldp),
[inca] "r" (inca),
[n_mker] "r" (n_mker),
[n_left] "r" (n_left)
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x8", "x9", "x10","x11","x12","x13","x14","x15",
"x16","x17","x18",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19","z20","z21","z22","z23",
// "z24","z25","z26","z27","z28","z29",
"z30","z31",
"p0", "p1", "p2", "p3", "p4", // "p5",
"p6", "p7", "p8"
);
}
else // if ( cdim < mnr )
{
bli_dscal2m_ex
(
0,
BLIS_NONUNIT_DIAG,
BLIS_DENSE,
( trans_t )conja,
cdim,
n,
kappa,
a, inca, lda,
p, 1, ldp,
cntx,
NULL
);
// if ( cdim < mnr )
{
const dim_t i = cdim;
const dim_t m_edge = mnr - i;
const dim_t n_edge = n_max;
double* restrict p_edge = p + (i )*1;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}
if ( n < n_max )
{
const dim_t j = n;
const dim_t m_edge = mnr;
const dim_t n_edge = n_max - j;
double* restrict p_edge = p + (j )*ldp;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}

View File

@@ -0,0 +1,359 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Linaro Limited
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <stdio.h>
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#else
#error "No Arm SVE intrinsics support in compiler"
#endif // __ARM_FEATURE_SVE
// assumption:
// SVE vector length = 512 bits.
// TODO:
// 2-rows -> 3 vectors packing and use predicator only in odd num of rows to be packed.
// prefetching is needed.
void bli_dpackm_armsve512_asm_12xk
(
conj_t conja,
pack_t schema,
dim_t cdim_,
dim_t n_,
dim_t n_max_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
const int64_t cdim = cdim_;
const int64_t mnr = 12;
const int64_t n = n_;
const int64_t n_max = n_max_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
double* restrict alpha1 = a;
double* restrict alpha1_8 = alpha1 + 8 * inca;
double* restrict alpha1_p4 = alpha1 + 4 * inca;
double* restrict alpha1_m4 = alpha1 - 4 * inca;
double* restrict pi1 = p;
const svbool_t all_active = svptrue_b64();
const svbool_t first_half_active = svwhilelt_b64(0, 4);
const svbool_t last_half_active = svnot_z(all_active, first_half_active);
svfloat64_t z_a0;
svfloat64_t z_a8;
svfloat64_t z_a8_lh;
svfloat64_t z_a16;
svuint64_t z_index;
// creating index for gather/scatter
// with each element as: 0, 1*inca, 2*inca, 3*inca
z_index = svindex_u64( 0, inca * sizeof( double ) );
if ( cdim == mnr )
{
if ( bli_deq1( *kappa ) )
{
if ( inca == 1 ) // continous memory. packA style
{
dim_t k = n;
// 2 pack into 3 case.
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// load 12 continuous elments from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// load 12 continuous elments from *a, filling last half of z8.
z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_f64( all_active, alpha1_p4 );
// stored packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
// line-by-line packing case.
for ( ; k != 0; --k )
{
// load 12 continuous elments from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// store them into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
else // gather/scatter load/store. packB style
{
dim_t k = n;
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// gather load from *a, filling last half of z8.
z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
// stored packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
for ( ; k != 0; --k )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// scatter store into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
}
else // *kappa != 1.0
{
// load kappa into vector
svfloat64_t z_kappa;
z_kappa = svdup_f64( *kappa );
if ( inca == 1 ) // continous memory. packA style
{
dim_t k = n;
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// load 12 continuous elments from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// load 12 continuous elments from *a, filling last half of z8.
z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_f64( all_active, alpha1_p4 );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
// stored packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
for ( ; k != 0; --k )
{
// load 12 continuous elments from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
// store them into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
else // gather/scatter load/store. packB style
{
dim_t k = n;
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// gather load from *a, filling last half of z8.
z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
// stored packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
for ( ; k != 0; --k )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
// scatter store into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
} // end of if ( *kappa == 1.0 )
}
else // if ( cdim < mnr )
{
bli_dscal2m_ex
(
0,
BLIS_NONUNIT_DIAG,
BLIS_DENSE,
( trans_t )conja,
cdim,
n,
kappa,
a, inca, lda,
p, 1, ldp,
cntx,
NULL
);
// if ( cdim < mnr )
{
const dim_t i = cdim;
const dim_t m_edge = mnr - i;
const dim_t n_edge = n_max;
double* restrict p_edge = p + (i )*1;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}
if ( n < n_max )
{
const dim_t j = n;
const dim_t m_edge = mnr;
const dim_t n_edge = n_max - j;
double* restrict p_edge = p + (j )*ldp;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}

View File

@@ -0,0 +1,363 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "armsve512_asm_transpose_d8x8.h"
// assumption:
// SVE vector length = 512 bits.
void bli_dpackm_armsve512_asm_16xk
(
conj_t conja,
pack_t schema,
dim_t cdim_,
dim_t n_,
dim_t n_max_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
const int64_t cdim = cdim_;
const int64_t mnr = 16;
const int64_t n = n_;
const int64_t n_max = n_max_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
const bool gs = inca != 1 && lda != 1;
const bool unitk = bli_deq1( *kappa );
#ifdef _A64FX
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
{
// A twisted way to infer whether A or B is being packed.
if ( schema == bli_cntx_schema_a_block(cntx) )
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
if ( schema == bli_cntx_schema_b_panel(cntx) )
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
}
#endif
if ( cdim == mnr && !gs && unitk )
{
uint64_t n_mker = n / 8;
uint64_t n_left = n % 8;
__asm__ volatile (
"mov x0, %[a] \n\t"
"mov x1, %[p] \n\t"
"mov x2, %[ldp] \n\t"
"mov x3, %[lda] \n\t"
"mov x4, %[inca] \n\t"
"cmp x4, #1 \n\t"
// Skips by sizeof(double).
"mov x8, #8 \n\t"
"madd x2, x2, x8, xzr \n\t"
"madd x3, x3, x8, xzr \n\t"
"madd x4, x4, x8, xzr \n\t"
// "mov x8, 0x8 \n\t" // Control#0 for A address.
// "mov x8, 0x24 \n\t" // Higher 6bit for Control#0:
// "lsl x8, x8, #58 \n\t" // Valid|Strong|Strong|Alloc|Load|Strong
// "orr x8, x8, x3 \n\t" // Stride.
// "msr S3_3_C11_C6_0, x8 \n\t" // Write system register.
// Loop constants.
"mov x8, %[n_mker] \n\t"
"mov x9, %[n_left] \n\t"
"ptrue p0.d \n\t"
"b.ne .AROWSTOR \n\t"
// A stored in columns.
" .ACOLSTOR: \n\t"
// Prefetch distance.
"mov x17, #8 \n\t"
"madd x17, x17, x3, xzr \n\t"
#ifdef _A64FX
"mov x16, 0x6 \n\t" // Disable hardware prefetch for A.
"lsl x16, x16, #60 \n\t"
"orr x0, x0, x16 \n\t"
#endif
// "add x5, x0, x3 \n\t"
// "add x6, x5, x3 \n\t"
// "add x7, x6, x3 \n\t"
// "prfm PLDL1STRM, [x0] \n\t"
// "prfm PLDL1STRM, [x5] \n\t"
// "prfm PLDL1STRM, [x6] \n\t"
// "prfm PLDL1STRM, [x7] \n\t"
// "add x18, x7, x3 \n\t"
// "add x5, x18, x3 \n\t"
// "add x6, x5, x3 \n\t"
// "add x7, x6, x3 \n\t"
// "prfm PLDL1STRM, [x18] \n\t"
// "prfm PLDL1STRM, [x5] \n\t"
// "prfm PLDL1STRM, [x6] \n\t"
// "prfm PLDL1STRM, [x7] \n\t"
" .ACOLSTORMKER: \n\t"
"cmp x8, xzr \n\t"
"b.eq .ACOLSTORMKEREND \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
"ld1d z2.d, p0/z, [x5] \n\t"
"ld1d z3.d, p0/z, [x5, #1, mul vl] \n\t"
"ld1d z4.d, p0/z, [x6] \n\t"
"ld1d z5.d, p0/z, [x6, #1, mul vl] \n\t"
"ld1d z6.d, p0/z, [x7] \n\t"
"ld1d z7.d, p0/z, [x7, #1, mul vl] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x0, x7, x3 \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"ld1d z8.d, p0/z, [x0] \n\t"
"ld1d z9.d, p0/z, [x0, #1, mul vl] \n\t"
"ld1d z10.d, p0/z, [x5] \n\t"
"ld1d z11.d, p0/z, [x5, #1, mul vl] \n\t"
"ld1d z12.d, p0/z, [x6] \n\t"
"ld1d z13.d, p0/z, [x6, #1, mul vl] \n\t"
"ld1d z14.d, p0/z, [x7] \n\t"
"ld1d z15.d, p0/z, [x7, #1, mul vl] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
"st1d z2.d, p0, [x10] \n\t"
"st1d z3.d, p0, [x10, #1, mul vl] \n\t"
"st1d z4.d, p0, [x11] \n\t"
"st1d z5.d, p0, [x11, #1, mul vl] \n\t"
"st1d z6.d, p0, [x12] \n\t"
"st1d z7.d, p0, [x12, #1, mul vl] \n\t"
"st1d z8.d, p0, [x13] \n\t"
"st1d z9.d, p0, [x13, #1, mul vl] \n\t"
"st1d z10.d, p0, [x14] \n\t"
"st1d z11.d, p0, [x14, #1, mul vl] \n\t"
"st1d z12.d, p0, [x15] \n\t"
"st1d z13.d, p0, [x15, #1, mul vl] \n\t"
"st1d z14.d, p0, [x16] \n\t"
"st1d z15.d, p0, [x16, #1, mul vl] \n\t"
"add x0, x7, x3 \n\t"
"add x1, x16, x2 \n\t"
"sub x8, x8, #1 \n\t"
"b .ACOLSTORMKER \n\t"
" .ACOLSTORMKEREND: \n\t"
" .ACOLSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
"add x0, x0, x3 \n\t"
"add x1, x1, x2 \n\t"
"sub x9, x9, #1 \n\t"
"b .ACOLSTORLEFT \n\t"
// A stored in rows.
" .AROWSTOR: \n\t"
// Prepare predicates for in-reg transpose.
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
"cmp x8, xzr \n\t"
"b.eq .AROWSTORMKEREND \n\t"
"add x10, x0, x4 \n\t"
"add x11, x10, x4 \n\t"
"add x12, x11, x4 \n\t"
"add x13, x12, x4 \n\t"
"add x14, x13, x4 \n\t"
"add x15, x14, x4 \n\t"
"add x16, x15, x4 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x10] \n\t"
"ld1d z2.d, p0/z, [x11] \n\t"
"ld1d z3.d, p0/z, [x12] \n\t"
"ld1d z4.d, p0/z, [x13] \n\t"
"ld1d z5.d, p0/z, [x14] \n\t"
"ld1d z6.d, p0/z, [x15] \n\t"
"ld1d z7.d, p0/z, [x16] \n\t"
"add x5, x16, x4 \n\t"
"add x10, x5, x4 \n\t"
"add x11, x10, x4 \n\t"
"add x12, x11, x4 \n\t"
"add x13, x12, x4 \n\t"
"add x14, x13, x4 \n\t"
"add x15, x14, x4 \n\t"
"add x16, x15, x4 \n\t"
"ld1d z16.d, p0/z, [x5] \n\t"
"ld1d z17.d, p0/z, [x10] \n\t"
"ld1d z18.d, p0/z, [x11] \n\t"
"ld1d z19.d, p0/z, [x12] \n\t"
"ld1d z20.d, p0/z, [x13] \n\t"
"ld1d z21.d, p0/z, [x14] \n\t"
"ld1d z22.d, p0/z, [x15] \n\t"
"ld1d z23.d, p0/z, [x16] \n\t"
// Transpose first 8 rows.
SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
// Transpose last 8 rows.
SVE512_IN_REG_TRANSPOSE_d8x8(z24,z25,z26,z27,z28,z29,z30,z31,z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3,p8,p4,p6)
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"st1d z8.d, p0, [x1] \n\t"
"st1d z24.d, p0, [x1, #1, mul vl] \n\t"
"st1d z9.d, p0, [x10] \n\t"
"st1d z25.d, p0, [x10, #1, mul vl] \n\t"
"st1d z10.d, p0, [x11] \n\t"
"st1d z26.d, p0, [x11, #1, mul vl] \n\t"
"st1d z11.d, p0, [x12] \n\t"
"st1d z27.d, p0, [x12, #1, mul vl] \n\t"
"st1d z12.d, p0, [x13] \n\t"
"st1d z28.d, p0, [x13, #1, mul vl] \n\t"
"st1d z13.d, p0, [x14] \n\t"
"st1d z29.d, p0, [x14, #1, mul vl] \n\t"
"st1d z14.d, p0, [x15] \n\t"
"st1d z30.d, p0, [x15, #1, mul vl] \n\t"
"st1d z15.d, p0, [x16] \n\t"
"st1d z31.d, p0, [x16, #1, mul vl] \n\t"
"add x0, x0, #64 \n\t"
"add x1, x16, x2 \n\t"
"sub x8, x8, #1 \n\t"
"b .AROWSTORMKER \n\t"
" .AROWSTORMKEREND: \n\t"
"mov x4, %[inca] \n\t" // Restore unshifted inca.
"index z30.d, xzr, x4 \n\t" // Generate index.
"lsl x4, x4, #3 \n\t" // Shift again.
"lsl x5, x4, #3 \n\t" // Virtual column vl.
" .AROWSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"add x6, x0, x5 \n\t"
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
"ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
"add x1, x1, x2 \n\t"
"add x0, x0, #8 \n\t"
"sub x9, x9, #1 \n\t"
"b .AROWSTORLEFT \n\t"
" .UNITKDONE: \n\t"
"mov x0, #0 \n\t"
:
: [a] "r" (a),
[p] "r" (p),
[lda] "r" (lda),
[ldp] "r" (ldp),
[inca] "r" (inca),
[n_mker] "r" (n_mker),
[n_left] "r" (n_left)
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x8", "x9", "x10","x11","x12","x13","x14","x15",
"x16","x17","x18",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10","z11","z12","z13","z14","z15",
// "z16","z17","z18","z19","z20","z21","z22","z23",
// "z24","z25","z26","z27","z28","z29","z30","z31",
"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7"
);
}
else // if ( cdim < mnr )
{
bli_dscal2m_ex
(
0,
BLIS_NONUNIT_DIAG,
BLIS_DENSE,
( trans_t )conja,
cdim,
n,
kappa,
a, inca, lda,
p, 1, ldp,
cntx,
NULL
);
// if ( cdim < mnr )
{
const dim_t i = cdim;
const dim_t m_edge = mnr - i;
const dim_t n_edge = n_max;
double* restrict p_edge = p + (i )*1;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}
if ( n < n_max )
{
const dim_t j = n;
const dim_t m_edge = mnr;
const dim_t n_edge = n_max - j;
double* restrict p_edge = p + (j )*ldp;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}

View File

@@ -0,0 +1,191 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BADDR,8) \
GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BADDR,9) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
GEMM_FMLA2_LD1R(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BADDR,0) \
GEMM_FMLA2_LD1R(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BADDR,1) \
GEMM_FMLA2_LD1R(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BADDR,2) \
GEMM_FMLA2_LD1R(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BADDR,3) \
GEMM_FMLA2_LD1R(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BADDR,4) \
GEMM_FMLA2_LD1R(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BADDR,5) \
\
GEMM_FMLA2_LD1R(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BADDR,6) \
GEMM_FMLA2_LD1R(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BADDR,7)
// Second through forth microkernels are the first one with B vectors rotated.
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BRSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BRSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BRSBIT)
// NOTE:
// The microkernel (PLAIN_1-4 as a whole) satisfies on entry/exit
// (sth. akin to loop-invariant):
// - BV[0-7] holds B[0:7, 4*k_cur]
// - B's address stops at B[0, 4*k_cur+1]
// Final loop inside K=4 microkernels.
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BADDR,8) \
GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BADDR,9) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
// K=4 MKer loop with B memory scattered.
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
" mov "#BELMADDR", "#BADDR" \n\t" \
GEMM_FMLA2_LD1R_G_ELMFWD(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
\
GEMM_FMLA2_LD1R_G_ELMFWD(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BELMADDR,BRSBIT,BCSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BELMADDR,BRSBIT,BCSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BELMADDR,BRSBIT,BCSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
" mov "#BELMADDR", "#BADDR" \n\t" \
GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
#define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \
CLEAR_COL4(Z00,Z01,Z02,Z03) \
CLEAR_COL4(Z04,Z05,Z06,Z07) \
CLEAR_COL4(Z08,Z09,Z10,Z11) \
CLEAR_COL4(Z12,Z13,Z14,Z15) \
CLEAR_COL4(Z16,Z17,Z18,Z19)
#define SCALE_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19,ZFACTOR) \
SCALE_COL4(Z00,Z01,Z02,Z03,ZFACTOR) \
SCALE_COL4(Z04,Z05,Z06,Z07,ZFACTOR) \
SCALE_COL4(Z08,Z09,Z10,Z11,ZFACTOR) \
SCALE_COL4(Z12,Z13,Z14,Z15,ZFACTOR) \
SCALE_COL4(Z16,Z17,Z18,Z19,ZFACTOR)
#define GEMM_C_FMAD_UKER(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE) \
GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE)
#define GEMM_C_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
#define GEMM_C_STORE_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
#define GEMM_C_FMAD_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,CADDR,CCS) \
GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C0FH,C0LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C1FH,C1LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C2FH,C2LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C3FH,C3LH,PFH,PLH,CADDR,CCS) \
GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C4FH,C4LH,PFH,PLH,CADDR,CCS)
#define GEMM_C_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_GATHER_LOAD_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_GATHER_LOAD_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_GATHER_LOAD_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_GATHER_LOAD_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_GATHER_LOAD_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
#define GEMM_C_STORE_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_SCATTER_STORE_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_SCATTER_STORE_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_SCATTER_STORE_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_SCATTER_STORE_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_SCATTER_STORE_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
#define GEMM_C_FMAD_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,ZIDX,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
GEMM_CCOL_GATHER_LOAD_FWD(C0FH,C0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
GEMM_CCOL_GATHER_LOAD_FWD(C1FH,C1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
GEMM_CCOL_GATHER_LOAD_FWD(C2FH,C2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
GEMM_CCOL_GATHER_LOAD_FWD(C3FH,C3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
GEMM_CCOL_GATHER_LOAD_FWD(C4FH,C4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)

View File

@@ -0,0 +1,123 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define CLEAR_COL2(Z0,Z1) \
" dup "#Z0"."DT", #0 \n\t" \
" dup "#Z1"."DT", #0 \n\t"
#define CLEAR_COL4(Z0,Z1,Z2,Z3) \
CLEAR_COL2(Z0,Z1) \
CLEAR_COL2(Z2,Z3)
#define SCALE_COL2(Z0,Z1,ZFACTOR) \
" fmul "#Z0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \
" fmul "#Z1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t" \
#define SCALE_COL4(Z0,Z1,Z2,Z3,ZFACTOR) \
SCALE_COL2(Z0,Z1,ZFACTOR) \
SCALE_COL2(Z2,Z3,ZFACTOR)
// Prefetch or not.
#define PREFETCH_CONTIGUOUS_noprfm(LV,PROP,ADDR,SHIFT)
#define PREFETCH_CONTIGUOUS_prfm(LV,PROP,ADDR,SHIFT) \
" prfm PLD"#LV""#PROP", ["#ADDR", "#SHIFT"] \n\t"
#define GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
" fmla "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" /* A Row 0 :VL */ \
" fmla "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t" /* A Row VL:2VL */
#define GEMM_FMLA2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \
GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
" "LD1R" "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t"
#define GEMM_FMLA2_LD1R_G_ELMFWD(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BELMADDR,BCSBIT) \
GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
" "LD1R" "#BV"."DT", "#PT"/z, ["#BELMADDR"] \n\t" /* Load B */ \
" add "#BELMADDR", "#BELMADDR", "#BCSBIT" \n\t" /* Forward B element */
#define GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR"] \n\t" \
" "LD1" "#ZLH"."DT", "#PLH"/z, ["#AADDR", #1, mul vl]\n\t"
#define GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
" "LD1" "#ZLH"."DT", "#PLH"/z, ["#ATEMP", "#ZIDX"."DT", "OFFS"]\n\t"
// Prefetch or not.
#define GEMM_ACOL_GATHER_noprfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
#define GEMM_ACOL_GATHER_prfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
" "PRFG" PLD"#LV""#PROP", "#PFH", ["#AADDR", "#ZIDX"."DT", "OFFS"] \n\t" \
" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
" "PRFG" PLD"#LV""#PROP", "#PLH", ["#ATEMP", "#ZIDX"."DT", "OFFS"] \n\t"
#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(ZFH,ZLH,PFH,PLH,AADDR,A4KS,ACS,ATEMP,PREFMODE) \
" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
PREFETCH_CONTIGUOUS_ ##PREFMODE(L1,STRM,ATEMP,0)
#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,A4KS,APS,ACS,AVSKIP,ATEMP,PREFMODEL1,PREFMODEL2) \
" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
GEMM_ACOL_GATHER_ ##PREFMODEL1(L1,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
" add "#ATEMP", "#AADDR", "#APS" \n\t" \
GEMM_ACOL_GATHER_ ##PREFMODEL2(L2,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
#define GEMM_CCOL_CONTIGUOUS_LOAD_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,CADDR) \
" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (load) to next column. */
#define GEMM_CCOL_CONTIGUOUS_STORE_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR"] \n\t" \
" "ST1" "#ZLH"."DT", "#PLH", ["#CADDR", #1, mul vl] \n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (store) to next column. */
#define GEMM_CCOL_FMAD(ZFH,ZLH,PFH,PLH,CFH,CLH,ZSCALE) \
" fmad "#ZFH"."DT", "#PFH"/m, "#ZSCALE"."DT", "#CFH"."DT" \n\t" \
" fmad "#ZLH"."DT", "#PLH"/m, "#ZSCALE"."DT", "#CLH"."DT" \n\t"
#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CVSKIP,CTEMP) \
" add "#CADDR", "#CADDR", "#CCS" \n\t"
#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
" "ST1" "#ZLH"."DT", "#PLH", ["#CTEMP", "#ZIDX"."DT", "OFFS"]\n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t"

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Specify to use double precision.
#define DT "d"
#define LD1 "ld1d"
#define ST1 "st1d"
#define LD1R "ld1rd"
#define PRFG "prfd"
#define SZ "8"
#define OFFS "lsl #3"
// Include macros.
#include "armsve_asm_macros.h"

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Specify to use half precision.
#define DT "h"
#define LD1 "ld1h"
#define ST1 "st1h"
#define LD1R "ld1rh"
#define PRFG "prfh"
#define SZ "2"
// #define OFFS UNSUPPORTED
// Include macros.
#include "armsve_asm_macros.h"

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Specify to use single precision.
#define DT "s"
#define LD1 "ld1w"
#define ST1 "st1w"
#define LD1R "ld1rw"
#define PRFG "prfw"
#define SZ "4"
#define OFFS "uxtw #2"
// Include macros.
#include "armsve_asm_macros.h"

View File

@@ -0,0 +1,318 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschunszentrum Juelich
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Double-precision composite instructions.
#include "armsve_asm_macros_double.h"
// 2vx10 microkernels.
#include "armsve_asm_2vx10.h"
void bli_dgemm_armsve_asm_2vx10_unindexed
(
dim_t k0,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c0, inc_t cs_c0,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_mker = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile (
" ldr x0, %[a] \n\t"
" ldr x1, %[b] \n\t"
" mov x2, xzr \n\t"
" incd x2, ALL, MUL #2 \n\t" // Column-skip of A.
" mov x3, #10 \n\t" // Row-skip of B.
" \n\t"
" ldr x5, %[c] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
#ifdef _A64FX
" mov x8, 0x3 \n\t" // Tag C address.
" lsl x8, x8, #56 \n\t"
" orr x5, x5, x8 \n\t"
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" \n\t"
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
" madd x2, x8, x2, xzr \n\t" // cs_a
" madd x3, x8, x3, xzr \n\t" // rs_b
" madd x7, x8, x7, xzr \n\t" // cs_c
" ptrue p0.d \n\t"
" \n\t"
" ldr x4, %[k_mker] \n\t" // Number of loops.
" ldr x8, %[k_left] \n\t"
" \n\t"
" LOAD_ABC: \n\t"
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rd z21.d, p0/z, [x1, 8] \n\t"
" ld1rd z22.d, p0/z, [x1, 16] \n\t"
" ld1rd z23.d, p0/z, [x1, 24] \n\t"
" ld1rd z24.d, p0/z, [x1, 32] \n\t"
" ld1rd z25.d, p0/z, [x1, 40] \n\t"
" ld1rd z26.d, p0/z, [x1, 48] \n\t"
" ld1rd z27.d, p0/z, [x1, 56] \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
" \n\t"
" CCOL_PRFM: \n\t"
" cmp x6, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
" mov x16, x5 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
" \n\t"
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
" K_MKER_LOOP: \n\t"
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" b K_MKER_LOOP \n\t"
" \n\t"
" FIN_MKER_LOOP: \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
" \n\t"
" K_LEFT_LOOP: \n\t"
" cmp x8, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
" ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rd z21.d, p0/z, [x1, 8] \n\t"
" ld1rd z22.d, p0/z, [x1, 16] \n\t"
" ld1rd z23.d, p0/z, [x1, 24] \n\t"
" ld1rd z24.d, p0/z, [x1, 32] \n\t"
" ld1rd z25.d, p0/z, [x1, 40] \n\t"
" ld1rd z26.d, p0/z, [x1, 48] \n\t"
" ld1rd z27.d, p0/z, [x1, 56] \n\t"
" ld1rd z28.d, p0/z, [x1, 64] \n\t"
" ld1rd z29.d, p0/z, [x1, 72] \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add x0, x0, x2 \n\t" // Forward A.
" add x1, x1, x3 \n\t" // Forward B.
" sub x8, x8, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
" WRITE_MEM_PREP: \n\t"
" \n\t"
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
" ldr x8, %[beta] \n\t"
" ldr x4, [x4] \n\t" // Load alpha & beta (value).
" ldr x8, [x8] \n\t"
" dup z30.d, x4 \n\t" // Broadcast alpha & beta into vectors.
" dup z31.d, x8 \n\t"
" fmov d28, #1.0 \n\t" // Prepare FP 1.0.
" fmov x16, d28 \n\t"
" \n\t"
" PREFETCH_ABNEXT: \n\t"
" ldr x0, %[a_next] \n\t"
" ldr x1, %[b_next] \n\t"
#ifdef _A64FX
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" prfm PLDL1STRM, [x0] \n\t"
" prfm PLDL1STRM, [x0, 256*1] \n\t"
// " prfm PLDL2KEEP, [x0, 256*2] \n\t"
// " prfm PLDL2KEEP, [x0, 256*3] \n\t"
// " prfm PLDL2KEEP, [x0, 256*4] \n\t"
// " prfm PLDL2KEEP, [x0, 256*5] \n\t"
// " prfm PLDL2KEEP, [x0, 256*6] \n\t"
// " prfm PLDL2KEEP, [x0, 256*7] \n\t"
// " prfm PLDL2KEEP, [x0, 256*8] \n\t"
// " prfm PLDL2KEEP, [x0, 256*9] \n\t"
// " prfm PLDL2KEEP, [x0, 256*10] \n\t"
// " prfm PLDL2KEEP, [x0, 256*11] \n\t"
// " prfm PLDL2KEEP, [x0, 256*12] \n\t"
// " prfm PLDL2KEEP, [x0, 256*13] \n\t"
// " prfm PLDL2KEEP, [x0, 256*14] \n\t"
// " prfm PLDL2KEEP, [x0, 256*15] \n\t"
" prfm PLDL1STRM, [x1] \n\t"
" prfm PLDL1STRM, [x1, 256*1] \n\t"
// " prfm PLDL2KEEP, [x1, 256*2] \n\t"
// " prfm PLDL2KEEP, [x1, 256*3] \n\t"
// " prfm PLDL2KEEP, [x1, 256*4] \n\t"
// " prfm PLDL2KEEP, [x1, 256*5] \n\t"
// " prfm PLDL2KEEP, [x1, 256*6] \n\t"
// " prfm PLDL2KEEP, [x1, 256*7] \n\t"
// " prfm PLDL2KEEP, [x1, 256*8] \n\t"
// " prfm PLDL2KEEP, [x1, 256*9] \n\t"
" \n\t"
" mov x9, x5 \n\t" // C address for loading.
" \n\t" // C address for storing is x5 itself.
" cmp x6, #1 \n\t" // Preload first half of C for contiguous case.
" b.ne WRITE_MEM \n\t"
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
" \n\t"
" WRITE_MEM: \n\t"
" \n\t"
" cmp x16, x4 \n\t"
" b.eq UNIT_ALPHA \n\t"
" \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
" \n\t"
" UNIT_ALPHA: \n\t"
" cmp x6, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-29].
// First half of C is already loaded in this case.
GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7)
" \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
" b END_WRITE_MEM \n\t"
" \n\t"
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov x8, xzr \n\t"
" incb x8 \n\t"
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
" \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" b END_EXEC \n\t"
" \n\t"
" END_ERROR: \n\t"
" mov x0, #1 \n\t" // Return error.
" END_EXEC: \n\t"
" mov x0, #0 \n\t" // Return normal.
:
: [a] "m" (a),
[b] "m" (b),
[c] "m" (c),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[k_mker] "m" (k_mker),
[k_left] "m" (k_left),
[alpha] "m" (alpha),
[beta] "m" (beta),
[a_next] "m" (a_next),
[b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
"x9","x16",
"z0","z1","z2","z3","z4","z5","z6","z7",
"z8","z9","z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19",
"z20","z21","z22","z23",
"z24","z25","z26","z27",
"z28","z29","z30","z31"
);
}

View File

@@ -0,0 +1,307 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Copyright (C) 2019, Forschunszentrum Juelich
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Single-precision composite instructions.
#include "armsve_asm_macros_single.h"
// 2vx10 microkernels.
#include "armsve_asm_2vx10.h"
void bli_sgemm_armsve_asm_2vx10_unindexed
(
dim_t k0,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c0, inc_t cs_c0,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_mker = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile (
" ldr x0, %[a] \n\t"
" ldr x1, %[b] \n\t"
" mov x2, xzr \n\t"
" incw x2, ALL, MUL #2 \n\t" // Column-skip of A.
" mov x3, #10 \n\t" // Row-skip of B.
" \n\t"
" ldr x5, %[c] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
#ifdef _A64FX
" mov x8, 0x3 \n\t" // Tag C address.
" lsl x8, x8, #56 \n\t"
" orr x5, x5, x8 \n\t"
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" \n\t"
" mov x8, #4 \n\t" // Multiply some address skips by sizeof(float).
" madd x2, x8, x2, xzr \n\t" // cs_a
" madd x3, x8, x3, xzr \n\t" // rs_b
" madd x7, x8, x7, xzr \n\t" // cs_c
" ptrue p0.s \n\t"
" \n\t"
" ldr x4, %[k_mker] \n\t" // Number of loops.
" ldr x8, %[k_left] \n\t"
" \n\t"
" LOAD_ABC: \n\t"
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rw z21.s, p0/z, [x1, 4] \n\t"
" ld1rw z22.s, p0/z, [x1, 8] \n\t"
" ld1rw z23.s, p0/z, [x1, 12] \n\t"
" ld1rw z24.s, p0/z, [x1, 16] \n\t"
" ld1rw z25.s, p0/z, [x1, 20] \n\t"
" ld1rw z26.s, p0/z, [x1, 24] \n\t"
" ld1rw z27.s, p0/z, [x1, 28] \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
" \n\t"
" CCOL_PRFM: \n\t"
" cmp x6, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
" mov x16, x5 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
" \n\t"
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
" K_MKER_LOOP: \n\t"
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" b K_MKER_LOOP \n\t"
" \n\t"
" FIN_MKER_LOOP: \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
" \n\t"
" K_LEFT_LOOP: \n\t"
" cmp x8, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rw z21.s, p0/z, [x1, 4] \n\t"
" ld1rw z22.s, p0/z, [x1, 8] \n\t"
" ld1rw z23.s, p0/z, [x1, 12] \n\t"
" ld1rw z24.s, p0/z, [x1, 16] \n\t"
" ld1rw z25.s, p0/z, [x1, 20] \n\t"
" ld1rw z26.s, p0/z, [x1, 24] \n\t"
" ld1rw z27.s, p0/z, [x1, 28] \n\t"
" ld1rw z28.s, p0/z, [x1, 32] \n\t"
" ld1rw z29.s, p0/z, [x1, 36] \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add x0, x0, x2 \n\t" // Forward A.
" add x1, x1, x3 \n\t" // Forward B.
" sub x8, x8, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
" WRITE_MEM_PREP: \n\t"
" \n\t"
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
" ldr x8, %[beta] \n\t"
" ldr w4, [x4] \n\t" // Load alpha & beta (value).
" ldr w8, [x8] \n\t"
" dup z30.s, w4 \n\t" // Broadcast alpha & beta into vectors.
" dup z31.s, w8 \n\t"
" \n\t"
" PREFETCH_ABNEXT: \n\t"
" ldr x0, %[a_next] \n\t"
" ldr x1, %[b_next] \n\t"
" prfm PLDL2KEEP, [x0] \n\t"
" prfm PLDL2KEEP, [x0, 256*1] \n\t"
" prfm PLDL2KEEP, [x0, 256*2] \n\t"
" prfm PLDL2KEEP, [x0, 256*3] \n\t"
" prfm PLDL2KEEP, [x0, 256*4] \n\t"
" prfm PLDL2KEEP, [x0, 256*5] \n\t"
" prfm PLDL2KEEP, [x0, 256*6] \n\t"
" prfm PLDL2KEEP, [x0, 256*7] \n\t"
" prfm PLDL2KEEP, [x0, 256*8] \n\t"
" prfm PLDL2KEEP, [x0, 256*9] \n\t"
" prfm PLDL2KEEP, [x0, 256*10] \n\t"
" prfm PLDL2KEEP, [x0, 256*11] \n\t"
" prfm PLDL2KEEP, [x0, 256*12] \n\t"
" prfm PLDL2KEEP, [x0, 256*13] \n\t"
" prfm PLDL2KEEP, [x0, 256*14] \n\t"
" prfm PLDL2KEEP, [x0, 256*15] \n\t"
" prfm PLDL2KEEP, [x1] \n\t"
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
" \n\t"
" WRITE_MEM: \n\t"
" \n\t"
" fmov s28, #1.0 \n\t"
" fmov w16, s28 \n\t"
" cmp w16, w4 \n\t"
" b.eq UNIT_ALPHA \n\t"
" \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
" \n\t"
" UNIT_ALPHA: \n\t"
" mov x9, x5 \n\t" // C address for loading.
" \n\t" // C address for storing is x5 itself.
" cmp x6, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-29].
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
" \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
" b END_WRITE_MEM \n\t"
" \n\t"
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov x8, xzr \n\t"
" incb x8 \n\t"
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
" \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" b END_EXEC \n\t"
" \n\t"
" END_ERROR: \n\t"
" mov x0, #1 \n\t" // Return error.
" END_EXEC: \n\t"
" mov x0, #0 \n\t" // Return normal.
:
: [a] "m" (a),
[b] "m" (b),
[c] "m" (c),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[k_mker] "m" (k_mker),
[k_left] "m" (k_left),
[alpha] "m" (alpha),
[beta] "m" (beta),
[a_next] "m" (a_next),
[b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
"x9","x16",
"z0","z1","z2","z3","z4","z5","z6","z7",
"z8","z9","z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19",
"z20","z21","z22","z23",
"z24","z25","z26","z27",
"z28","z29","z30","z31"
);
}

View File

@@ -0,0 +1,343 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Copyright (C) 2019, Forschunszentrum Juelich
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Half-precision composite instructions.
#include "armsve_asm_macros_half.h"
// 2vx10 microkernels.
#include "armsve_asm_2vx10.h"
// Gather-load / scatter-store instruction for half-precision
// needs being defined separately.
#undef GEMM_CCOL_GATHER_LOAD_FWD
#undef GEMM_CCOL_SCATTER_STORE_FWD
#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \
" add x28, "#CADDR", "#CRS2" \n\t" \
" ld1h z31.s, "#PT"/z, ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \
" ld1h "#ZFH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \
" fadd "#ZFH".h, "#ZFH".h, z31.h \n\t" \
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
" add x28, "#CTEMP", "#CRS2" \n\t" \
" ld1h z31.s, "#PT"/z, ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \
" ld1h "#ZLH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZLH".s, "#PT"/m, "#ZLH".s \n\t" \
" fadd "#ZLH".h, "#ZLH".h, z31.h \n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t"
#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \
" add x28, "#CADDR", "#CRS2" \n\t" \
" st1h "#ZFH".s, "#PT", ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \
" st1h "#ZFH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
" add x28, "#CTEMP", "#CRS2" \n\t" \
" st1h "#ZLH".s, "#PT", ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZLH".s, "#PT"/m, "#ZLH".s \n\t" \
" st1h "#ZLH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t"
void bli_shgemm_armsve_asm_2vx10_unindexed
(
dim_t k0,
void* restrict alpha,
void* restrict a,
void* restrict b,
void* restrict beta,
void* restrict c, inc_t rs_c0, inc_t cs_c0,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_mker = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile (
" ldr x0, %[a] \n\t"
" ldr x1, %[b] \n\t"
" mov x2, xzr \n\t"
" inch x2, ALL, MUL #2 \n\t" // Column-skip of A.
" mov x3, #10 \n\t" // Row-skip of B.
" \n\t"
" ldr x5, %[c] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
#ifdef _A64FX
" mov x8, 0x3 \n\t" // Tag C address.
" lsl x8, x8, #56 \n\t"
" orr x5, x5, x8 \n\t"
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" \n\t"
" mov x8, #2 \n\t" // Multiply some address skips by sizeof(float16_t).
" madd x2, x8, x2, xzr \n\t" // cs_a
" madd x3, x8, x3, xzr \n\t" // rs_b
" madd x7, x8, x7, xzr \n\t" // cs_c
" ptrue p0.b \n\t"
" \n\t"
" ldr x4, %[k_mker] \n\t" // Number of loops.
" ldr x8, %[k_left] \n\t"
" \n\t"
" LOAD_ABC: \n\t"
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rh z21.h, p0/z, [x1, 2] \n\t"
" ld1rh z22.h, p0/z, [x1, 4] \n\t"
" ld1rh z23.h, p0/z, [x1, 6] \n\t"
" ld1rh z24.h, p0/z, [x1, 8] \n\t"
" ld1rh z25.h, p0/z, [x1, 10] \n\t"
" ld1rh z26.h, p0/z, [x1, 12] \n\t"
" ld1rh z27.h, p0/z, [x1, 14] \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
" \n\t"
" CCOL_PRFM: \n\t"
" cmp x6, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
" mov x16, x5 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
" \n\t"
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
" K_MKER_LOOP: \n\t"
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" b K_MKER_LOOP \n\t"
" \n\t"
" FIN_MKER_LOOP: \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
" \n\t"
" K_LEFT_LOOP: \n\t"
" cmp x8, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rh z21.h, p0/z, [x1, 2] \n\t"
" ld1rh z22.h, p0/z, [x1, 4] \n\t"
" ld1rh z23.h, p0/z, [x1, 6] \n\t"
" ld1rh z24.h, p0/z, [x1, 8] \n\t"
" ld1rh z25.h, p0/z, [x1, 10] \n\t"
" ld1rh z26.h, p0/z, [x1, 12] \n\t"
" ld1rh z27.h, p0/z, [x1, 14] \n\t"
" ld1rh z28.h, p0/z, [x1, 16] \n\t"
" ld1rh z29.h, p0/z, [x1, 18] \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add x0, x0, x2 \n\t" // Forward A.
" add x1, x1, x3 \n\t" // Forward B.
" sub x8, x8, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
" WRITE_MEM_PREP: \n\t"
" \n\t"
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
" ldr x8, %[beta] \n\t"
" ld1rh z30.h, p0/z, [x4] \n\t" // Load alpha & beta into vectors.
" ld1rh z31.h, p0/z, [x8] \n\t"
" fmov w4, h28 \n\t" // Copy alpha & beta to GP registers.
" fmov w8, h29 \n\t"
" \n\t"
" PREFETCH_ABNEXT: \n\t"
" ldr x0, %[a_next] \n\t"
" ldr x1, %[b_next] \n\t"
" prfm PLDL2KEEP, [x0] \n\t"
" prfm PLDL2KEEP, [x0, 256*1] \n\t"
" prfm PLDL2KEEP, [x0, 256*2] \n\t"
" prfm PLDL2KEEP, [x0, 256*3] \n\t"
" prfm PLDL2KEEP, [x0, 256*4] \n\t"
" prfm PLDL2KEEP, [x0, 256*5] \n\t"
" prfm PLDL2KEEP, [x0, 256*6] \n\t"
" prfm PLDL2KEEP, [x0, 256*7] \n\t"
" prfm PLDL2KEEP, [x0, 256*8] \n\t"
" prfm PLDL2KEEP, [x0, 256*9] \n\t"
" prfm PLDL2KEEP, [x0, 256*10] \n\t"
" prfm PLDL2KEEP, [x0, 256*11] \n\t"
" prfm PLDL2KEEP, [x0, 256*12] \n\t"
" prfm PLDL2KEEP, [x0, 256*13] \n\t"
" prfm PLDL2KEEP, [x0, 256*14] \n\t"
" prfm PLDL2KEEP, [x0, 256*15] \n\t"
" prfm PLDL2KEEP, [x1] \n\t"
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
" \n\t"
" WRITE_MEM: \n\t"
" \n\t"
" fmov h28, #1.0 \n\t"
" fmov w16, h28 \n\t"
" cmp w16, w4 \n\t"
" b.eq UNIT_ALPHA \n\t"
" \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
" \n\t"
" UNIT_ALPHA: \n\t"
" mov x9, x5 \n\t" // C address for loading.
" \n\t" // C address for storing is x5 itself.
" cmp x6, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-29].
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
" \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
" b END_WRITE_MEM \n\t"
" \n\t"
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov x10, xzr \n\t"
" incb x10 \n\t"
" madd x10, x10, x6, xzr \n\t" // C-column's logical 1-vector skip.
" mov x28, #2 \n\t"
" madd x6, x28, x6, xzr \n\t" // Double index skip for half-precision case.
" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x9,x7,x10,x16)
" dup z31.h, w8 \n\t" // Restore beta destroyed by loading.
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x9,x7,x10,x16)
" \n\t"
" dup z31.h, w8 \n\t" // Restore beta destroyed by loading.
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x5,x7,x10,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x5,x7,x10,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" b END_EXEC \n\t"
" \n\t"
" END_ERROR: \n\t"
" mov x0, #1 \n\t" // Return error.
" END_EXEC: \n\t"
" mov x0, #0 \n\t" // Return normal.
:
: [a] "m" (a),
[b] "m" (b),
[c] "m" (c),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[k_mker] "m" (k_mker),
[k_left] "m" (k_left),
[alpha] "m" (alpha),
[beta] "m" (beta),
[a_next] "m" (a_next),
[b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
"x9","x16","x10","x28",
"z0","z1","z2","z3","z4","z5","z6","z7",
"z8","z9","z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19",
"z20","z21","z22","z23",
"z24","z25","z26","z27",
"z28","z29","z30","z31"
);
}

View File

@@ -0,0 +1,450 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Separate instantiation for ArmSVE reference kernels.
// Temporary workaround. Will be removed after upstream has switched to a better way
// of exposing gemmsup interface.
//
// -- Row storage case ---------------------------------------------------------
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
/* NOTE: This microkernel can actually handle arbitrarily large
values of m, n, and k. */ \
\
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
PASTEMAC(ch,conjs)( ab ); \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC2( gemmsup_r, _armsve, _ref2 )
//
// -- Column storage case ------------------------------------------------------
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
/* NOTE: This microkernel can actually handle arbitrarily large
values of m, n, and k. */ \
\
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
PASTEMAC(ch,conjs)( ab ); \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC2( gemmsup_c, _armsve, _ref2 )

View File

@@ -0,0 +1,528 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <assert.h>
// Double-precision composite instructions.
#include "../armsve_asm_macros_double.h"
// 2vx10 microkernels.
#include "../armsve_asm_2vx10.h"
// Prototype reference kernel.
GEMMSUP_KER_PROT( double, d, gemmsup_c_armsve_ref2 )
void __attribute__ ((noinline,optimize(0))) bli_dgemmsup_cv_armsve_2vx10_unindexed
(
conj_t conja,
conj_t conjb,
dim_t m0,
dim_t n0,
dim_t k0,
double* restrict alpha,
double* restrict a, inc_t rs_a0, inc_t cs_a0,
double* restrict b, inc_t rs_b0, inc_t cs_b0,
double* restrict beta,
double* restrict c, inc_t rs_c0, inc_t cs_c0,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
static int called = 0;
if ( !called )
{
fprintf(stderr, "rv called.\n");
called = 1;
}
// c*c requires A to be stored in columns.
assert( rs_a0 == 1 );
dim_t n0_mker = n0 / 10;
dim_t n0_left = n0 % 10;
if ( n0_left )
{
// A[:, ::]
// B[::, n0_mker*10:n0]
// C[: , n0_mker*10:n0]
double *ai = a;
double *bi = b + n0_mker * 10 * cs_b0;
double *ci = c + n0_mker * 10 * cs_c0;
bli_dgemmsup_c_armsve_ref2
(
conja, conjb,
m0, n0_left, k0,
alpha,
ai, rs_a0, cs_a0,
bi, rs_b0, cs_b0,
beta,
ci, rs_c0, cs_c0,
data,
cntx
);
}
// Return if it's a pure edge case.
if ( !n0_mker )
return;
// Determine VL.
uint64_t vlen2;
__asm__ (
" mov x0, xzr \n\t"
" incd x0, ALL, MUL #2 \n\t"
" mov %[vlen2], x0 \n\t"
: [vlen2] "=r" (vlen2)
:
: "x0"
);
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
// uint64_t rs_a = 1;
uint64_t cs_a = cs_a0;
uint64_t rs_b = rs_b0;
uint64_t cs_b = cs_b0;
uint64_t k_mker = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t n_mker = n0_mker;
dim_t m0_mker = m0 / vlen2;
dim_t m0_left = m0 % vlen2;
if ( m0_left )
{
// Edge case on A side can be handled with one more (predicated) loop.
m0_mker++;
} else
m0_left = vlen2;
// uint64_t ps_a = bli_auxinfo_ps_a( data );
uint64_t ps_b = bli_auxinfo_ps_b( data );
for ( dim_t im0_mker = 0; im0_mker < m0_mker; ++im0_mker )
{
uint64_t m_curr = vlen2;
if ( im0_mker == m0_mker - 1 )
{
// Last m-loop. Maybe unnecessary.
m_curr = m0_left;
}
double *ai = a + im0_mker * vlen2 * rs_a0;
double *bi = b;
double *ci = c + im0_mker * vlen2 * rs_c0;
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
__asm__ volatile (
" ldr x0, %[bi] \n\t"
" ldr x1, %[rs_b] \n\t" // Row-skip of B.
" ldr x2, %[cs_b] \n\t" // Column-skip of B (element skip of B[l, :]).
" ldr x3, %[ps_b] \n\t" // Panel-skip (10*k) of B.
" ldr x4, %[cs_a] \n\t" // Column-Skip of A.
" \n\t" // Element skip of A[:, l] is guaranteed to be 1.
" ldr x5, %[ci] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
#ifdef _A64FX
" mov x16, 0x1 \n\t" // Tag C address.
" lsl x16, x16, #56 \n\t"
" orr x5, x5, x16 \n\t"
" mov x16, 0x2 \n\t" // Tag B address.
" lsl x16, x16, #56 \n\t"
" orr x0, x0, x16 \n\t"
#endif
" \n\t"
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
" madd x1, x8, x1, xzr \n\t" // rs_b
" madd x2, x8, x2, xzr \n\t" // cs_b
" madd x3, x8, x3, xzr \n\t" // ps_b
" madd x4, x8, x4, xzr \n\t" // cs_a
" madd x7, x8, x7, xzr \n\t" // cs_c
" mov x8, #4 \n\t"
" madd x15, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for A.
" \n\t"
#ifdef _A64FX
" mov x16, 0x20 \n\t" // Higher 6bit for Control#2:
" lsl x16, x16, #58 \n\t" // Valid|Strong|Strong|NoAlloc|Load|Strong
" orr x16, x16, x4 \n\t" // Stride.
" msr S3_3_C11_C6_2, x16 \n\t" // Write system register.
#endif
" \n\t"
" ldr x8, %[m_curr] \n\t" // Size of first dimension.
" mov x9, xzr \n\t"
" incd x9 \n\t"
" ptrue p0.d \n\t"
" whilelo p1.d, xzr, x8 \n\t"
" whilelo p2.d, x9, x8 \n\t"
" \n\t"
" ldr x8, %[n_mker] \n\t" // Number of N-loops.
" \n\t"
" ldr x20, %[ai] \n\t" // Parameters to be reloaded
" ldr x21, %[k_mker] \n\t" // within each millikernel loop.
" ldr x22, %[k_left] \n\t"
" ldr x23, %[alpha] \n\t"
" ldr x24, %[beta] \n\t"
" ldr x25, %[a_next] \n\t"
" ldr x26, %[b_next] \n\t"
" ldr x23, [x23] \n\t" // Directly load alpha and beta.
" ldr x24, [x24] \n\t"
" \n\t"
" MILLIKER_MLOOP: \n\t"
" \n\t"
" mov x11, x0 \n\t" // B's address.
// " ldr x10, %[ai] \n\t" // A's address.
" mov x10, x20 \n\t"
// " ldr x12, %[k_mker] \n\t"
" mov x12, x21 \n\t"
// " ldr x13, %[k_left] \n\t"
" mov x13, x22 \n\t"
#ifdef _A64FX
" mov x16, 0x3 \n\t" // Tag A address.
" lsl x16, x16, #56 \n\t"
" orr x10, x10, x16 \n\t"
" mov x16, 0xa \n\t" // Control#2 for A address.
" lsl x16, x16, #60 \n\t"
" orr x10, x10, x16 \n\t"
#endif
" \n\t"
" cmp x12, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" \n\t"
" mov x14, x11 \n\t"
" ld1rd z20.d, p0/z, [x14] \n\t" // Load 8/10 of first B row.
" add x14, x14, x2 \n\t"
" ld1rd z21.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z22.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z23.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z24.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z25.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z26.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z27.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" prfm PLDL1KEEP, [x14] \n\t" // And prefetch the 2/10 left.
" add x14, x14, x2 \n\t"
" prfm PLDL1KEEP, [x14] \n\t"
" sub x14, x14, x2 \n\t" // Restore x14 to load edge.
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p1,p2,x10)
" add x16, x10, x4 \n\t"
" prfm PLDL1STRM, [x16] \n\t" // Prefetch 3/4 of A.
" add x16, x10, x4 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x10, x4 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" \n\t"
" CCOL_PRFM: \n\t"
" cmp x6, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
" mov x16, x5 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
" \n\t"
" cmp x12, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
" K_MKER_LOOP: \n\t"
" \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
" \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
" \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
" \n\t"
" subs x12, x12, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
" b K_MKER_LOOP \n\t"
" \n\t"
" FIN_MKER_LOOP: \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
" add x10, x10, x4 \n\t" // Forward A to fill the blank.
" \n\t"
" K_LEFT_LOOP: \n\t"
" cmp x13, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p1,p2,x10)
" mov x14, x11 \n\t"
" ld1rd z20.d, p0/z, [x14] \n\t" // Load 10/10 B.
" add x14, x14, x2 \n\t"
" ld1rd z21.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z22.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z23.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z24.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z25.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z26.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z27.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z28.d, p0/z, [x14] \n\t"
" add x14, x14, x2 \n\t"
" ld1rd z29.d, p0/z, [x14] \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add x10, x10, x4 \n\t" // Forward A.
" add x11, x11, x1 \n\t" // Forward B.
" sub x13, x13, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
" WRITE_MEM_PREP: \n\t"
" \n\t"
// " ldr x10, %[ai] \n\t"
" mov x10, x20 \n\t"
" add x11, x0, x3 \n\t"
" dup z30.d, x23 \n\t" // Broadcast alpha & beta into vectors.
" dup z31.d, x24 \n\t"
" \n\t"
" cmp x8, #1 \n\t"
" b.eq PREFETCH_ABNEXT \n\t"
" prfm PLDL1STRM, [x10] \n\t"
" prfm PLDL1KEEP, [x11] \n\t"
" add x11, x11, x2 \n\t"
" prfm PLDL1KEEP, [x11] \n\t"
" add x11, x11, x2 \n\t"
" prfm PLDL1KEEP, [x11] \n\t"
" add x11, x11, x2 \n\t"
" prfm PLDL1KEEP, [x11] \n\t"
" add x11, x11, x2 \n\t"
" prfm PLDL1KEEP, [x11] \n\t"
" add x11, x11, x2 \n\t"
" prfm PLDL1KEEP, [x11] \n\t"
" add x11, x11, x2 \n\t"
" prfm PLDL1KEEP, [x11] \n\t"
" add x11, x11, x2 \n\t"
" prfm PLDL1KEEP, [x11] \n\t"
" add x11, x11, x2 \n\t"
" prfm PLDL1KEEP, [x11] \n\t"
" add x11, x11, x2 \n\t"
" prfm PLDL1KEEP, [x11] \n\t"
" b WRITE_MEM \n\t"
" \n\t"
" PREFETCH_ABNEXT: \n\t"
// " ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed.
" mov x1, x25 \n\t"
// " ldr x2, %[b_next] \n\t"
" mov x2, x26 \n\t"
" prfm PLDL2KEEP, [x1] \n\t"
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
" prfm PLDL2KEEP, [x1, 256*10] \n\t"
" prfm PLDL2KEEP, [x1, 256*11] \n\t"
" prfm PLDL2KEEP, [x1, 256*12] \n\t"
" prfm PLDL2KEEP, [x1, 256*13] \n\t"
" prfm PLDL2KEEP, [x1, 256*14] \n\t"
" prfm PLDL2KEEP, [x1, 256*15] \n\t"
" prfm PLDL2KEEP, [x2] \n\t"
" prfm PLDL2KEEP, [x2, 256*1] \n\t"
" prfm PLDL2KEEP, [x2, 256*2] \n\t"
" prfm PLDL2KEEP, [x2, 256*3] \n\t"
" prfm PLDL2KEEP, [x2, 256*4] \n\t"
" prfm PLDL2KEEP, [x2, 256*5] \n\t"
" prfm PLDL2KEEP, [x2, 256*6] \n\t"
" prfm PLDL2KEEP, [x2, 256*7] \n\t"
" prfm PLDL2KEEP, [x2, 256*8] \n\t"
" prfm PLDL2KEEP, [x2, 256*9] \n\t"
" \n\t"
" WRITE_MEM: \n\t"
" \n\t"
" fmov d28, #1.0 \n\t"
" fmov x16, d28 \n\t"
" cmp x16, x23 \n\t"
" b.eq UNIT_ALPHA \n\t"
" \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
" \n\t"
" UNIT_ALPHA: \n\t"
" mov x9, x5 \n\t" // C address for loading.
" \n\t" // C address for storing is x5 itself.
" cmp x6, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-29].
" mov x13, xzr \n\t" // C-column's physical 1-vector skip.
" incb x13 \n\t"
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
" \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x5,x7)
" b END_WRITE_MEM \n\t"
" \n\t"
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov x12, xzr \n\t"
" incb x12 \n\t"
" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip.
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
" \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x5,x7,x13,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x5,x7,x13,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" subs x8, x8, #1 \n\t"
" b.eq END_EXEC \n\t"
" \n\t" // Address of C already forwarded to next column.
" add x0, x0, x3 \n\t" // Forward B's base address to the next logic panel.
" b MILLIKER_MLOOP \n\t"
" \n\t"
" END_ERROR: \n\t"
" mov x0, #1 \n\t" // Return error.
" END_EXEC: \n\t"
" mov x0, #0 \n\t" // Return normal.
:
: [bi] "m" (bi),
[rs_b] "m" (rs_b),
[cs_b] "m" (cs_b),
[ps_b] "m" (ps_b),
[cs_a] "m" (cs_a),
[ci] "m" (ci),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[m_curr] "m" (m_curr),
[n_mker] "m" (n_mker),
[ai] "m" (ai),
[k_mker] "m" (k_mker),
[k_left] "m" (k_left),
[alpha] "m" (alpha),
[beta] "m" (beta),
[a_next] "m" (a_next),
[b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
"x9","x10","x11","x12","x13","x14","x15","x16","x17",
"x20","x21","x22","x23","x24","x25","x26",
"z0","z1","z2","z3","z4","z5","z6","z7",
"z8","z9","z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19",
"z20","z21","z22","z23",
"z24","z25","z26","z27",
"z28","z29","z30","z31"
);
}
}
void bli_dgemmsup_rv_armsve_10x2v_unindexed
(
conj_t conjat,
conj_t conjbt,
dim_t m0t,
dim_t n0t,
dim_t k0,
double* restrict alpha,
double* restrict at, inc_t rs_at0, inc_t cs_at0,
double* restrict bt, inc_t rs_bt0, inc_t cs_bt0,
double* restrict beta,
double* restrict ct, inc_t rs_ct0, inc_t cs_ct0,
auxinfo_t* restrict datat,
cntx_t* restrict cntx
)
{
auxinfo_t data;
bli_auxinfo_set_next_a( bli_auxinfo_next_b( datat ), &data );
bli_auxinfo_set_next_b( bli_auxinfo_next_a( datat ), &data );
bli_auxinfo_set_ps_a( bli_auxinfo_ps_b( datat ), &data );
bli_auxinfo_set_ps_b( bli_auxinfo_ps_a( datat ), &data );
bli_dgemmsup_cv_armsve_2vx10_unindexed
(
conjbt, conjat,
n0t, m0t, k0,
alpha,
bt, cs_bt0, rs_bt0,
at, cs_at0, rs_at0,
beta,
ct, cs_ct0, rs_ct0,
&data,
cntx
);
}

View File

@@ -0,0 +1,412 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <assert.h>
// Double-precision composite instructions.
#include "../armsve_asm_macros_double.h"
// 2vx10 microkernels.
#include "../armsve_asm_2vx10.h"
// Prototype reference kernel.
GEMMSUP_KER_PROT( double, d, gemmsup_r_armsve_ref2 )
void __attribute__ ((optimize(0))) bli_dgemmsup_rv_armsve_2vx10_unindexed
(
conj_t conja,
conj_t conjb,
dim_t m0,
dim_t n0,
dim_t k0,
double* restrict alpha,
double* restrict a, inc_t rs_a0, inc_t cs_a0,
double* restrict b, inc_t rs_b0, inc_t cs_b0,
double* restrict beta,
double* restrict c, inc_t rs_c0, inc_t cs_c0,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
static int called = 0;
if ( !called )
{
fprintf(stderr, "rv called.\n");
called = 1;
}
// r*r requires B to be stored in rows.
assert(cs_b0 == 1);
dim_t n0_mker = n0 / 10;
dim_t n0_left = n0 % 10;
if ( n0_left )
{
// A[:, ::]
// B[::, n0_mker*10:n0]
// C[: , n0_mker*10:n0]
double *ai = a;
double *bi = b + n0_mker * 10 * cs_b0;
double *ci = c + n0_mker * 10 * cs_c0;
bli_dgemmsup_r_armsve_ref2
(
conja, conjb,
m0, n0_left, k0,
alpha,
ai, rs_a0, cs_a0,
bi, rs_b0, cs_b0,
beta,
ci, rs_c0, cs_c0,
data,
cntx
);
}
// Return if it's a pure edge case.
if ( !n0_mker )
return;
// Determine VL.
uint64_t vlen2;
__asm__ (
" mov x0, xzr \n\t"
" incd x0, ALL, MUL #2 \n\t"
" mov %[vlen2], x0 \n\t"
: [vlen2] "=r" (vlen2)
:
: "x0"
);
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
uint64_t rs_b = rs_b0;
// uint64_t cs_b = 1;
uint64_t k_mker = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t m_mker = m0 / vlen2;
uint64_t m_left = m0 % vlen2;
if ( m_left )
{
// Edge case on A side can be handled with one more (predicated) loop.
m_mker++;
} else
m_left = vlen2;
uint64_t ps_a = bli_auxinfo_ps_a( data );
// uint64_t ps_b = bli_auxinfo_ps_b( data );
for ( dim_t in0_mker = 0; in0_mker < n0_mker; ++in0_mker )
{
double *ai = a;
double *bi = b + in0_mker * 10 * cs_b0;
double *ci = c + in0_mker * 10 * cs_c0;
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
__asm__ volatile (
" ldr x0, %[ai] \n\t"
" ldr x1, %[rs_a] \n\t" // Row-skip of A (element skip of A[:, l]).
" ldr x2, %[cs_a] \n\t" // Column-skip of A.
" ldr x3, %[ps_a] \n\t" // Panel-skip (vlen2*k) of A.
" ldr x4, %[rs_b] \n\t" // Row-Skip of B.
" \n\t" // Element skip of B[l, :] is guaranteed to be 1.
" ldr x5, %[ci] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
#ifdef _A64FX
" mov x16, 0x1 \n\t" // Tag C address.
" lsl x16, x16, #56 \n\t"
" orr x5, x5, x16 \n\t"
" mov x16, 0x2 \n\t" // Tag A address.
" lsl x16, x16, #56 \n\t"
" orr x0, x0, x16 \n\t"
#endif
" \n\t"
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
" madd x2, x8, x2, xzr \n\t" // cs_a
" madd x3, x8, x3, xzr \n\t" // ps_a
" madd x4, x8, x4, xzr \n\t" // rs_b
" madd x7, x8, x7, xzr \n\t" // cs_c
" mov x8, xzr \n\t"
" incb x8 \n\t"
" madd x14, x8, x1, xzr \n\t" // A-column's logical 1-vector skip.
" mov x8, #4 \n\t"
" madd x15, x8, x2, xzr \n\t" // Logical K=4 microkernel skip for A.
// " mov x8, #4 \n\t"
// " madd x17, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for B.
" \n\t"
" ldr x8, %[m_mker] \n\t" // Number of M-loops.
" ptrue p0.d \n\t"
" ptrue p1.d \n\t"
" ptrue p2.d \n\t"
" \n\t"
" MILLIKER_MLOOP: \n\t"
" \n\t"
" cmp x8, #1 \n\t"
" b.ne UKER_BEGIN \n\t"
" \n\t"
" ldr x10, %[m_left] \n\t" // Final (incomplete) millikernel loop.
" mov x11, xzr \n\t"
" incd x11 \n\t"
" whilelo p1.d, xzr, x10 \n\t" // Overwrite p1/p2.
" whilelo p2.d, x11, x10 \n\t"
" \n\t"
" UKER_BEGIN: \n\t"
" mov x10, x0 \n\t" // A's address.
" ldr x11, %[bi] \n\t" // B's address.
" ldr x12, %[k_mker] \n\t"
" ldr x13, %[k_left] \n\t"
#ifdef _A64FX
" mov x16, 0x3 \n\t" // Tag B address.
" lsl x16, x16, #56 \n\t"
" orr x11, x11, x16 \n\t"
#endif
" \n\t"
" mov x16, x11 \n\t" // Prefetch first kernel of B.
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" \n\t"
" ld1rd z20.d, p0/z, [x11] \n\t" // (Partial) first B row.
" ld1rd z21.d, p0/z, [x11, #8] \n\t"
" ld1rd z22.d, p0/z, [x11, #16] \n\t"
" ld1rd z23.d, p0/z, [x11, #24] \n\t"
" ld1rd z24.d, p0/z, [x11, #32] \n\t"
" ld1rd z25.d, p0/z, [x11, #40] \n\t"
" ld1rd z26.d, p0/z, [x11, #48] \n\t"
" ld1rd z27.d, p0/z, [x11, #56] \n\t"
" \n\t"
" index z29.d, xzr, x1 \n\t" // First A column.
" \n\t" // Skips passed to index is not multiplied by 8.
GEMM_ACOL_GATHER_LOAD(z28,z29,z29,p1,p2,x10,x14,x16)
" \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
" \n\t"
" cmp x12, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
" K_MKER_LOOP: \n\t" // Unroll the 4-loop.
" \n\t"
" index z31.d, xzr, x1 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
" \n\t"
" index z29.d, xzr, x1 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
" \n\t"
" index z31.d, xzr, x1 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
" \n\t"
" subs x12, x12, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
" index z29.d, xzr, x1 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
" b K_MKER_LOOP \n\t"
" \n\t"
" FIN_MKER_LOOP: \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
" add x10, x10, x2 \n\t" // Forward A to fill the blank.
" \n\t"
" K_LEFT_LOOP: \n\t"
" cmp x13, #0 \n\t"
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
" index z31.d, xzr, x1 \n\t"
GEMM_ACOL_GATHER_LOAD(z30,z31,z31,p1,p2,x10,x14,x16)
" ld1rd z20.d, p0/z, [x11] \n\t"
" ld1rd z21.d, p0/z, [x11, #8] \n\t"
" ld1rd z22.d, p0/z, [x11, #16] \n\t"
" ld1rd z23.d, p0/z, [x11, #24] \n\t"
" ld1rd z24.d, p0/z, [x11, #32] \n\t"
" ld1rd z25.d, p0/z, [x11, #40] \n\t"
" ld1rd z26.d, p0/z, [x11, #48] \n\t"
" ld1rd z27.d, p0/z, [x11, #56] \n\t"
" ld1rd z28.d, p0/z, [x11, #64] \n\t"
" ld1rd z29.d, p0/z, [x11, #72] \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add x10, x10, x2 \n\t" // Forward A.
" add x11, x11, x4 \n\t" // Forward B.
" sub x13, x13, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
" WRITE_MEM_PREP: \n\t"
" \n\t"
" ldr x11, %[bi] \n\t"
" ldr x12, %[alpha] \n\t" // Load alpha & beta.
" ldr x13, %[beta] \n\t"
" ld1rd z30.d, p0/z, [x12] \n\t"
" ld1rd z31.d, p0/z, [x13] \n\t"
" ldr x12, [x12] \n\t"
" \n\t"
" cmp x8, #1 \n\t"
" b.eq PREFETCH_ABNEXT \n\t"
" prfm PLDL2STRM, [x11] \n\t"
" b WRITE_MEM \n\t"
" \n\t"
" PREFETCH_ABNEXT: \n\t"
" ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed.
" ldr x2, %[b_next] \n\t"
" prfm PLDL2KEEP, [x1] \n\t"
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
" prfm PLDL2KEEP, [x1, 256*10] \n\t"
" prfm PLDL2KEEP, [x1, 256*11] \n\t"
" prfm PLDL2KEEP, [x1, 256*12] \n\t"
" prfm PLDL2KEEP, [x1, 256*13] \n\t"
" prfm PLDL2KEEP, [x1, 256*14] \n\t"
" prfm PLDL2KEEP, [x1, 256*15] \n\t"
" prfm PLDL2KEEP, [x2] \n\t"
" prfm PLDL2KEEP, [x2, 256*1] \n\t"
" prfm PLDL2KEEP, [x2, 256*2] \n\t"
" prfm PLDL2KEEP, [x2, 256*3] \n\t"
" prfm PLDL2KEEP, [x2, 256*4] \n\t"
" prfm PLDL2KEEP, [x2, 256*5] \n\t"
" prfm PLDL2KEEP, [x2, 256*6] \n\t"
" prfm PLDL2KEEP, [x2, 256*7] \n\t"
" prfm PLDL2KEEP, [x2, 256*8] \n\t"
" prfm PLDL2KEEP, [x2, 256*9] \n\t"
" \n\t"
" WRITE_MEM: \n\t"
" \n\t"
" fmov d28, #1.0 \n\t"
" fmov x16, d28 \n\t"
" cmp x16, x12 \n\t"
" b.eq UNIT_ALPHA \n\t"
" \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
" \n\t"
" UNIT_ALPHA: \n\t"
" mov x9, x5 \n\t" // C address for loading.
" mov x10, x5 \n\t" // C address for storing.
" cmp x6, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-29].
" mov x13, xzr \n\t" // C-column's physical 1-vector skip.
" incb x13 \n\t"
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
" \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x10,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x10,x7)
" b END_WRITE_MEM \n\t"
" \n\t"
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov x12, xzr \n\t"
" incb x12 \n\t"
" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip.
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
" \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x10,x7,x13,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x10,x7,x13,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" subs x8, x8, #1 \n\t"
" b.eq END_EXEC \n\t"
" \n\t"
" add x0, x0, x3 \n\t" // Forward A's base address to the next logic panel.
" add x5, x5, x13 \n\t" // Forward C's base address to the next logic panel.
" add x5, x5, x13 \n\t"
" b MILLIKER_MLOOP \n\t"
" \n\t"
" END_ERROR: \n\t"
" mov x0, #1 \n\t" // Return error.
" END_EXEC: \n\t"
" mov x0, #0 \n\t" // Return normal.
:
: [ai] "m" (ai),
[rs_a] "m" (rs_a),
[cs_a] "m" (cs_a),
[ps_a] "m" (ps_a),
[rs_b] "m" (rs_b),
[ci] "m" (ci),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[m_mker] "m" (m_mker),
[m_left] "m" (m_left),
[bi] "m" (bi),
[k_mker] "m" (k_mker),
[k_left] "m" (k_left),
[alpha] "m" (alpha),
[beta] "m" (beta),
[a_next] "m" (a_next),
[b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
"x9","x10","x11","x12","x13","x14","x15","x16",//"x17",
"z0","z1","z2","z3","z4","z5","z6","z7",
"z8","z9","z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19",
"z20","z21","z22","z23",
"z24","z25","z26","z27",
"z28","z29","z30","z31"
);
}
}

View File

@@ -33,5 +33,13 @@
*/
GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 )
GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed )
GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed )
GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed )
PACKM_KER_PROT( double, d, packm_armsve256_asm_8xk )
PACKM_KER_PROT( double, d, packm_armsve512_asm_16xk )
PACKM_KER_PROT( double, d, packm_armsve512_asm_12xk )
PACKM_KER_PROT( double, d, packm_armsve512_asm_10xk )