mirror of
https://github.com/amd/blis.git
synced 2026-06-29 10:47:16 +00:00
Added 512b SVE-based a64fx subconfig + SVE kernels.
Details: - Added 512-bit specific 'a64fx' subconfiguration that uses empirically tuned block size by Stepan Nassyr. This subconfig also sets the sector cache size and enables memory-tagging code in SVE gemm kernels. This subconfig utilizes (16, k) and (10, k) DPACKM kernels. - Added a vector-length agnostic 'armsve' subconfiguration that computes blocksizes according to the analytical model. This part is ported from Stepan Nassyr's repository. - Implemented vector-length-agnostic [d/s/sh] gemm kernels for Arm SVE at size (2*VL, 10). These kernels use unindexed FMLA instructions because indexed FMLA takes 2 FMA units in many implementations. PS: There are indexed-FLMA kernels in Stepan Nassyr's repository. - Implemented 512-bit SVE dpackm kernels with in-register transpose support for sizes (16, k) and (10, k). - Extended 256-bit SVE dpackm kernels by Linaro Ltd. to 512-bit for size (12, k). This dpackm kernel is not currently used by any subconfiguration. - Implemented several experimental dgemmsup kernels which would improve performance in a few cases. However, those dgemmsup kernels generally underperform hence they are not currently used in any subconfig. - Note: This commit squashes several commits submitted by RuQing Xu via PR #424.
This commit is contained in:
117
config/a64fx/bli_a64fx_sector_cache.h
Normal file
117
config/a64fx/bli_a64fx_sector_cache.h
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Forschunszentrum Juelich
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// A64FX: set up cache sizes
|
||||
//
|
||||
// Reference: A64FX (TM) specification Fujitsu HPC Extension
|
||||
// Link: https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_Specification_HPC_Extension_v1_EN.pdf
|
||||
//
|
||||
// 63:15 | 14:12 | 11 | 10:08 | 07 | 06:04 | 03 | 02:00 |
|
||||
// RES0 | l1_sec3_max | RES0 | l1_sec2_max | RES0 | l1_sec1_max | RES0 | l1_sec0_max |
|
||||
//
|
||||
// the bits set number of maximum sectors from 0-7
|
||||
// 000 - 0
|
||||
// 001 - 1
|
||||
// 010 - 2
|
||||
// 011 - 3
|
||||
// 100 - 4
|
||||
// 101 - 5
|
||||
// 110 - 6
|
||||
// 111 - 7
|
||||
//
|
||||
// For L1 we want to maximize the number of sectors for B
|
||||
// Configuration 1: 1 sector for C (sector 3)
|
||||
// 1 sector for A (sector 1)
|
||||
// 6 sectors for B (sector 2)
|
||||
// 0 sectors for the rest (sector 0)
|
||||
//
|
||||
// 16b bitfield conf. 1: 0b0 001 0 110 0 001 0 000
|
||||
//
|
||||
// Configuration 2: 1 sector for C (sector 3)
|
||||
// 1 sector for A (sector 1)
|
||||
// 5 sectors for B (sector 2)
|
||||
// 1 sectors for the rest (sector 0)
|
||||
//
|
||||
// 16b bitfield conf. 2: 0b0 001 0 101 0 001 0 001
|
||||
//
|
||||
// accessing the control register:
|
||||
//
|
||||
// MRS <Xt>, S3_3_C11_C8_2
|
||||
// MSR S3_3_C11_C8_2, <Xt>
|
||||
//
|
||||
// TODO: First tests showed no change in performance, a deeper investigation
|
||||
// is necessary
|
||||
#define A64FX_SETUP_SECTOR_CACHE_SIZES(config_bitfield)\
|
||||
{\
|
||||
uint64_t sector_cache_config = config_bitfield;\
|
||||
__asm__ volatile(\
|
||||
"msr s3_3_c11_c8_2,%[sector_cache_config]"\
|
||||
:\
|
||||
: [sector_cache_config] "r" (sector_cache_config)\
|
||||
:\
|
||||
);\
|
||||
}
|
||||
|
||||
#define A64FX_SETUP_SECTOR_CACHE_SIZES_L2(config_bitfield)\
|
||||
{\
|
||||
uint64_t sector_cache_config = config_bitfield;\
|
||||
__asm__ volatile(\
|
||||
"msr s3_3_c15_c8_2,%[sector_cache_config]"\
|
||||
:\
|
||||
: [sector_cache_config] "r" (sector_cache_config)\
|
||||
:\
|
||||
);\
|
||||
}
|
||||
|
||||
|
||||
#define A64FX_SET_CACHE_SECTOR(areg, tag, sparereg)\
|
||||
" mov "#sparereg", "#tag" \n\t"\
|
||||
" lsl "#sparereg", "#sparereg", 56 \n\t"\
|
||||
" orr "#areg", "#areg", "#sparereg" \n\t"
|
||||
|
||||
#define A64FX_READ_SECTOR_CACHE_SIZES(output_uint64)\
|
||||
__asm__ volatile(\
|
||||
"mrs %["#output_uint64"],s3_3_c11_c8_2"\
|
||||
: [output_uint64] "=r" (output_uint64)\
|
||||
: \
|
||||
:\
|
||||
);
|
||||
|
||||
#define A64FX_SCC(sec0,sec1,sec2,sec3)\
|
||||
(uint64_t)((sec0 & 0x7LU) | ((sec1 & 0x7LU) << 4) | ((sec2 & 0x7LU) << 8) | ((sec3 & 0x7LU) << 12))
|
||||
|
||||
#define A64FX_SCC_L2(sec02,sec13)\
|
||||
(uint64_t)((sec02 & 0x1FLU) | ((sec13 & 0x1FLU) << 8))
|
||||
|
||||
151
config/a64fx/bli_cntx_init_a64fx.c
Normal file
151
config/a64fx/bli_cntx_init_a64fx.c
Normal file
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "bli_a64fx_sector_cache.h"
|
||||
|
||||
void bli_cntx_init_a64fx( cntx_t* cntx )
|
||||
{
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
||||
|
||||
// Set default kernel blocksizes and functions.
|
||||
bli_cntx_init_a64fx_ref( cntx );
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Update the context with optimized native gemm micro-kernels and
|
||||
// their storage preferences.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
2,
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Set SVE-512 packing routine.
|
||||
bli_cntx_set_packm_kers
|
||||
(
|
||||
3,
|
||||
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
|
||||
BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
|
||||
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
cntx
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 );
|
||||
|
||||
// Initialize the context with the sup thresholds.
|
||||
bli_cntx_set_l3_sup_thresh
|
||||
(
|
||||
3,
|
||||
BLIS_MT, &thresh[ BLIS_MT ],
|
||||
BLIS_NT, &thresh[ BLIS_NT ],
|
||||
BLIS_KT, &thresh[ BLIS_KT ],
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
4,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for small/unpacked level-3 problems.
|
||||
bli_cntx_set_l3_sup_blkszs
|
||||
(
|
||||
5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
|
||||
// Set A64FX cache sector sizes for each PE/CMG
|
||||
// SC Fugaku might disable users' setting cache sizes.
|
||||
#if !defined(CACHE_SECTOR_SIZE_READONLY)
|
||||
#pragma omp parallel
|
||||
{
|
||||
A64FX_SETUP_SECTOR_CACHE_SIZES(A64FX_SCC(0,1,3,0))
|
||||
A64FX_SETUP_SECTOR_CACHE_SIZES_L2(A64FX_SCC_L2(9,28))
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
46
config/a64fx/bli_family_a64fx.h
Normal file
46
config/a64fx/bli_family_a64fx.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//#ifndef BLIS_FAMILY_H
|
||||
//#define BLIS_FAMILY_H
|
||||
|
||||
|
||||
// -- MEMORY ALLOCATION --------------------------------------------------------
|
||||
|
||||
#define BLIS_SIMD_ALIGN_SIZE 256
|
||||
#define BLIS_SIMD_NUM_REGISTERS 32
|
||||
|
||||
|
||||
//#endif
|
||||
|
||||
82
config/a64fx/make_defs.mk
Normal file
82
config/a64fx/make_defs.mk
Normal file
@@ -0,0 +1,82 @@
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
# Declare the name of the current configuration and add it to the
|
||||
# running list of configurations included by common.mk.
|
||||
THIS_CONFIG := a64fx
|
||||
#CONFIGS_INCL += $(THIS_CONFIG)
|
||||
|
||||
#
|
||||
# --- Determine the C compiler and related flags ---
|
||||
#
|
||||
|
||||
# NOTE: The build system will append these variables with various
|
||||
# general-purpose/configuration-agnostic flags in common.mk. You
|
||||
# may specify additional flags here as needed.
|
||||
CPPROCFLAGS := -D_GNU_SOURCE -D_A64FX
|
||||
CMISCFLAGS :=
|
||||
CPICFLAGS :=
|
||||
CWARNFLAGS :=
|
||||
|
||||
ifneq ($(DEBUG_TYPE),off)
|
||||
CDBGFLAGS := -g
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3 -ftree-vectorize -march=armv8-a+sve
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
CKVECFLAGS :=
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
92
config/armsve/bli_armsve_config_utils.c
Normal file
92
config/armsve/bli_armsve_config_utils.c
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Forschunszentrum Juelich
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
dim_t bli_vl_bits_armsve(void)
|
||||
{ \
|
||||
uint64_t vl = 0;
|
||||
__asm__ (
|
||||
" mov x0, xzr \n\t"
|
||||
" incb x0 \n\t"
|
||||
" mov %[vl], x0 \n\t"
|
||||
: [vl] "=r" (vl)
|
||||
:
|
||||
: "x0"
|
||||
);
|
||||
return vl;
|
||||
}
|
||||
|
||||
|
||||
#define EXPANDMAC_BLKSZ_ARMSVE(ch, S_Data) \
|
||||
void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \
|
||||
dim_t *k_c_, dim_t *m_c_, dim_t *n_c_) \
|
||||
{ \
|
||||
dim_t W_L1 = bli_env_get_var("BLIS_SVE_W_L1", W_L1_SVE_DEFAULT); \
|
||||
dim_t N_L1 = bli_env_get_var("BLIS_SVE_N_L1", N_L1_SVE_DEFAULT); \
|
||||
dim_t C_L1 = bli_env_get_var("BLIS_SVE_C_L1", C_L1_SVE_DEFAULT); \
|
||||
dim_t W_L2 = bli_env_get_var("BLIS_SVE_W_L2", W_L2_SVE_DEFAULT); \
|
||||
dim_t N_L2 = bli_env_get_var("BLIS_SVE_N_L2", N_L2_SVE_DEFAULT); \
|
||||
dim_t C_L2 = bli_env_get_var("BLIS_SVE_C_L2", C_L2_SVE_DEFAULT); \
|
||||
dim_t W_L3 = bli_env_get_var("BLIS_SVE_W_L3", W_L3_SVE_DEFAULT); \
|
||||
dim_t N_L3 = bli_env_get_var("BLIS_SVE_N_L3", N_L3_SVE_DEFAULT); \
|
||||
dim_t C_L3 = bli_env_get_var("BLIS_SVE_C_L3", C_L3_SVE_DEFAULT); \
|
||||
\
|
||||
dim_t vl_b = bli_vl_bits_armsve(); \
|
||||
dim_t vl = vl_b / S_Data; \
|
||||
dim_t m_r = 2 * vl; \
|
||||
dim_t n_r = 10; \
|
||||
\
|
||||
dim_t k_c = (dim_t)( floor((W_L1 - 1.0)/(1.0 + (double)n_r/m_r)) * N_L1 * C_L1 ) \
|
||||
/ (n_r * S_Data); \
|
||||
\
|
||||
dim_t C_Ac = W_L2 - 1 - ceil( (2.0 * k_c * n_r * S_Data)/(C_L2 * N_L2) ); \
|
||||
dim_t m_c = C_Ac * (N_L2 * C_L2)/(k_c * S_Data); \
|
||||
m_c -= m_c % m_r; \
|
||||
\
|
||||
dim_t C_Bc = W_L3 - 1 - ceil( (2.0 * k_c * m_c * S_Data)/(C_L3 * N_L3) ); \
|
||||
dim_t n_c = C_Bc * (N_L3 * C_L3)/(k_c * S_Data); \
|
||||
n_c -= n_c % n_r; \
|
||||
\
|
||||
*m_r_ = m_r; \
|
||||
*n_r_ = n_r; \
|
||||
*k_c_ = k_c; \
|
||||
*m_c_ = m_c; \
|
||||
*n_c_ = n_c; \
|
||||
}
|
||||
|
||||
EXPANDMAC_BLKSZ_ARMSVE( s, 4 )
|
||||
EXPANDMAC_BLKSZ_ARMSVE( d, 8 )
|
||||
|
||||
42
config/armsve/bli_armsve_config_utils.h
Normal file
42
config/armsve/bli_armsve_config_utils.h
Normal file
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Forschunszentrum Juelich
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
dim_t bli_vl_bits_armsve(void);
|
||||
|
||||
void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
|
||||
void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
|
||||
|
||||
157
config/armsve/bli_cntx_init_armsve.c
Normal file
157
config/armsve/bli_cntx_init_armsve.c
Normal file
@@ -0,0 +1,157 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "bli_armsve_config_utils.h"
|
||||
|
||||
void bli_cntx_init_armsve( cntx_t* cntx )
|
||||
{
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
#if 0
|
||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
||||
#endif
|
||||
|
||||
// Set default kernel blocksizes and functions.
|
||||
bli_cntx_init_armsve_ref( cntx );
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Block size.
|
||||
dim_t m_r_s, n_r_s, k_c_s, m_c_s, n_c_s;
|
||||
dim_t m_r_d, n_r_d, k_c_d, m_c_d, n_c_d;
|
||||
bli_s_blksz_armsve(&m_r_s, &n_r_s, &k_c_s, &m_c_s, &n_c_s);
|
||||
bli_d_blksz_armsve(&m_r_d, &n_r_d, &k_c_d, &m_c_d, &n_c_d);
|
||||
|
||||
// Update the context with optimized native gemm micro-kernels and
|
||||
// their storage preferences.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
2,
|
||||
// These are vector-length agnostic kernels. Yet knowing mr is required at runtime.
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Set VL-specific packing routines if applicable.
|
||||
if (m_r_d==16)
|
||||
bli_cntx_set_packm_kers
|
||||
(
|
||||
3,
|
||||
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
|
||||
BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
|
||||
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
|
||||
cntx
|
||||
);
|
||||
else if (m_r_d==8)
|
||||
bli_cntx_set_packm_kers
|
||||
(
|
||||
1,
|
||||
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_asm_8xk,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
cntx
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 );
|
||||
|
||||
// Initialize the context with the sup thresholds.
|
||||
bli_cntx_set_l3_sup_thresh
|
||||
(
|
||||
3,
|
||||
BLIS_MT, &thresh[ BLIS_MT ],
|
||||
BLIS_NT, &thresh[ BLIS_NT ],
|
||||
BLIS_KT, &thresh[ BLIS_KT ],
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
4,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for small/unpacked level-3 problems.
|
||||
bli_cntx_set_l3_sup_blkszs
|
||||
(
|
||||
5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
56
config/armsve/bli_family_armsve.h
Normal file
56
config/armsve/bli_family_armsve.h
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//#ifndef BLIS_FAMILY_H
|
||||
//#define BLIS_FAMILY_H
|
||||
|
||||
|
||||
// -- MEMORY ALLOCATION --------------------------------------------------------
|
||||
|
||||
#define BLIS_SIMD_ALIGN_SIZE 256
|
||||
#define BLIS_SIMD_NUM_REGISTERS 32
|
||||
|
||||
// SVE-specific configs.
|
||||
#define N_L1_SVE_DEFAULT 64
|
||||
#define W_L1_SVE_DEFAULT 4
|
||||
#define C_L1_SVE_DEFAULT 256
|
||||
#define N_L2_SVE_DEFAULT 2048
|
||||
#define W_L2_SVE_DEFAULT 16
|
||||
#define C_L2_SVE_DEFAULT 256
|
||||
#define N_L3_SVE_DEFAULT 8192
|
||||
#define W_L3_SVE_DEFAULT 16
|
||||
#define C_L3_SVE_DEFAULT 256
|
||||
|
||||
//#endif
|
||||
|
||||
82
config/armsve/make_defs.mk
Normal file
82
config/armsve/make_defs.mk
Normal file
@@ -0,0 +1,82 @@
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
# Declare the name of the current configuration and add it to the
|
||||
# running list of configurations included by common.mk.
|
||||
THIS_CONFIG := armsve
|
||||
#CONFIGS_INCL += $(THIS_CONFIG)
|
||||
|
||||
#
|
||||
# --- Determine the C compiler and related flags ---
|
||||
#
|
||||
|
||||
# NOTE: The build system will append these variables with various
|
||||
# general-purpose/configuration-agnostic flags in common.mk. You
|
||||
# may specify additional flags here as needed.
|
||||
CPPROCFLAGS := -D_GNU_SOURCE
|
||||
CMISCFLAGS :=
|
||||
CPICFLAGS :=
|
||||
CWARNFLAGS :=
|
||||
|
||||
ifneq ($(DEBUG_TYPE),off)
|
||||
CDBGFLAGS := -g
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3 -ftree-vectorize -march=armv8-a+sve
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
CKVECFLAGS :=
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
@@ -32,6 +32,8 @@ piledriver: piledriver
|
||||
bulldozer: bulldozer
|
||||
|
||||
# ARM architectures.
|
||||
armsve: armsve/armsve
|
||||
a64fx: a64fx/armsve
|
||||
thunderx2: thunderx2/armv8a
|
||||
cortexa57: cortexa57/armv8a
|
||||
cortexa53: cortexa53/armv8a
|
||||
|
||||
@@ -173,6 +173,12 @@ void bli_arch_set_id( void )
|
||||
#endif
|
||||
|
||||
// ARM microarchitectures.
|
||||
#ifdef BLIS_FAMILY_ARMSVE
|
||||
id = BLIS_ARCH_ARMSVE;
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_A64FX
|
||||
id = BLIS_ARCH_A64FX;
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_THUNDERX2
|
||||
id = BLIS_ARCH_THUNDERX2;
|
||||
#endif
|
||||
@@ -242,6 +248,8 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
|
||||
"thunderx2",
|
||||
"cortexa57",
|
||||
"cortexa53",
|
||||
"armsve",
|
||||
"a64fx",
|
||||
"cortexa15",
|
||||
"cortexa9",
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ arch_t bli_cpuid_query_id( void )
|
||||
printf( "vendor = %s\n", vendor==1 ? "AMD": "INTEL" );
|
||||
printf("family = %x\n", family );
|
||||
printf( "model = %x\n", model );
|
||||
|
||||
|
||||
printf( "features = %x\n", features );
|
||||
#endif
|
||||
|
||||
@@ -455,6 +455,14 @@ arch_t bli_cpuid_query_id( void )
|
||||
{
|
||||
// Check for each ARMv8 configuration that is enabled, check for that
|
||||
// microarchitecture. We check from most recent to most dated.
|
||||
#ifdef BLIS_CONFIG_ARMSVE
|
||||
if ( bli_cpuid_is_armsve( model, part, features ) )
|
||||
return BLIS_ARCH_ARMSVE;
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_A64FX
|
||||
if ( bli_cpuid_is_a64fx( model, part, features ) )
|
||||
return BLIS_ARCH_A64FX;
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_THUNDERX2
|
||||
if ( bli_cpuid_is_thunderx2( model, part, features ) )
|
||||
return BLIS_ARCH_THUNDERX2;
|
||||
@@ -537,6 +545,36 @@ bool bli_cpuid_is_cortexa53
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_armsve
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
uint32_t features
|
||||
)
|
||||
{
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_SVE;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_a64fx
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
uint32_t features
|
||||
)
|
||||
{
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_SVE;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_cortexa15
|
||||
(
|
||||
uint32_t family,
|
||||
@@ -1032,6 +1070,10 @@ uint32_t bli_cpuid_query
|
||||
strstr( feat_str, "asimd" ) != NULL )
|
||||
*features |= FEATURE_NEON;
|
||||
|
||||
// Parse the feature string to check for SVE features.
|
||||
if ( strstr( feat_str, "sve" ) != NULL )
|
||||
*features |= FEATURE_SVE;
|
||||
|
||||
//printf( "bli_cpuid_query(): features var: %u\n", *features );
|
||||
|
||||
// Parse the processor string to uncover the model.
|
||||
|
||||
@@ -72,6 +72,8 @@ bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features
|
||||
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
|
||||
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );
|
||||
|
||||
@@ -175,7 +177,8 @@ enum
|
||||
};
|
||||
enum
|
||||
{
|
||||
FEATURE_NEON = 0x1
|
||||
FEATURE_NEON = 0x01,
|
||||
FEATURE_SVE = 0x02
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -144,6 +144,16 @@ void bli_gks_init( void )
|
||||
bli_cntx_init_cortexa53_ref,
|
||||
bli_cntx_init_cortexa53_ind );
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_ARMSVE
|
||||
bli_gks_register_cntx( BLIS_ARCH_ARMSVE, bli_cntx_init_armsve,
|
||||
bli_cntx_init_armsve_ref,
|
||||
bli_cntx_init_armsve_ind );
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_A64FX
|
||||
bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx,
|
||||
bli_cntx_init_a64fx_ref,
|
||||
bli_cntx_init_a64fx_ind );
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_CORTEXA15
|
||||
bli_gks_register_cntx( BLIS_ARCH_CORTEXA15, bli_cntx_init_cortexa15,
|
||||
bli_cntx_init_cortexa15_ref,
|
||||
|
||||
@@ -83,6 +83,12 @@ CNTX_INIT_PROTS( bulldozer )
|
||||
|
||||
// -- ARM architectures --
|
||||
|
||||
#ifdef BLIS_CONFIG_ARMSVE
|
||||
CNTX_INIT_PROTS( armsve )
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_A64FX
|
||||
CNTX_INIT_PROTS( a64fx )
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_THUNDERX2
|
||||
CNTX_INIT_PROTS( thunderx2 )
|
||||
#endif
|
||||
@@ -183,6 +189,12 @@ CNTX_INIT_PROTS( generic )
|
||||
|
||||
// -- ARM architectures --
|
||||
|
||||
#ifdef BLIS_FAMILY_ARMSVE
|
||||
#include "bli_family_armsve.h"
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_A64FX
|
||||
#include "bli_family_a64fx.h"
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_THUNDERX2
|
||||
#include "bli_family_thunderx2.h"
|
||||
#endif
|
||||
|
||||
@@ -1005,6 +1005,8 @@ typedef enum
|
||||
BLIS_ARCH_BULLDOZER,
|
||||
|
||||
// ARM
|
||||
BLIS_ARCH_ARMSVE,
|
||||
BLIS_ARCH_A64FX,
|
||||
BLIS_ARCH_THUNDERX2,
|
||||
BLIS_ARCH_CORTEXA57,
|
||||
BLIS_ARCH_CORTEXA53,
|
||||
@@ -1029,7 +1031,7 @@ typedef enum
|
||||
|
||||
// NOTE: This value must be updated to reflect the number of enum values
|
||||
// listed above for arch_t!
|
||||
#define BLIS_NUM_ARCHS 22
|
||||
//#define BLIS_NUM_ARCHS 25
|
||||
|
||||
|
||||
//
|
||||
|
||||
45
kernels/armsve/1m/armsve512_asm_transpose_d8x2.h
Normal file
45
kernels/armsve/1m/armsve512_asm_transpose_d8x2.h
Normal file
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#define SVE512_IN_REG_TRANSPOSE_d8x2(DST0,DST1,DST2,DST3,DST4,DST5,DST6SRC0,DST7SRC1,PT,P2C,P4C,P6C) \
|
||||
"trn1 " #DST0".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
|
||||
"trn2 " #DST1".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
|
||||
"compact " #DST2".d, " #P2C", " #DST0".d \n\t" \
|
||||
"compact " #DST3".d, " #P2C", " #DST1".d \n\t" \
|
||||
"compact " #DST4".d, " #P4C", " #DST0".d \n\t" \
|
||||
"compact " #DST5".d, " #P4C", " #DST1".d \n\t" \
|
||||
"compact " #DST6SRC0".d, " #P6C", " #DST0".d \n\t" \
|
||||
"compact " #DST7SRC1".d, " #P6C", " #DST1".d \n\t"
|
||||
|
||||
97
kernels/armsve/1m/armsve512_asm_transpose_d8x8.h
Normal file
97
kernels/armsve/1m/armsve512_asm_transpose_d8x8.h
Normal file
@@ -0,0 +1,97 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#define SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(XTMP,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
|
||||
"ptrue " #PT".d \n\t" \
|
||||
"mov " #XTMP", #2 \n\t" \
|
||||
"whilelo " #P2C".d, xzr, " #XTMP" \n\t" \
|
||||
"mov " #XTMP", #4 \n\t" \
|
||||
"whilelo " #P4".d, xzr, " #XTMP" \n\t" \
|
||||
"mov " #XTMP", #6 \n\t" \
|
||||
"whilelo " #P6".d, xzr, " #XTMP" \n\t" \
|
||||
\
|
||||
"eor " #PTFTF".b, " #PT"/z, " #P6".b, " #P4".b \n\t" /***** o o | o */ \
|
||||
"orr " #PTFTF".b, " #PT"/z, " #PTFTF".b, " #P2C".b \n\t" /* | o | o */ \
|
||||
\
|
||||
"not " #P2C".b, " #PT"/z, " #P2C".b \n\t" \
|
||||
"not " #P4C".b, " #PT"/z, " #P4".b \n\t" \
|
||||
"not " #P6C".b, " #PT"/z, " #P6".b \n\t" \
|
||||
|
||||
#define SVE512_IN_REG_TRANSPOSE_d8x8(DST0,DST1,DST2,DST3,DST4,DST5,DST6,DST7,SRC0,SRC1,SRC2,SRC3,SRC4,SRC5,SRC6,SRC7,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
|
||||
"trn1 " #DST0".d, " #SRC0".d, " #SRC1".d \n\t" \
|
||||
"trn2 " #DST1".d, " #SRC0".d, " #SRC1".d \n\t" \
|
||||
"trn1 " #DST2".d, " #SRC2".d, " #SRC3".d \n\t" \
|
||||
"trn2 " #DST3".d, " #SRC2".d, " #SRC3".d \n\t" \
|
||||
"trn1 " #DST4".d, " #SRC4".d, " #SRC5".d \n\t" \
|
||||
"trn2 " #DST5".d, " #SRC4".d, " #SRC5".d \n\t" \
|
||||
"trn1 " #DST6".d, " #SRC6".d, " #SRC7".d \n\t" \
|
||||
"trn2 " #DST7".d, " #SRC6".d, " #SRC7".d \n\t" \
|
||||
\
|
||||
"compact " #SRC0".d, " #P2C", " #DST0".d \n\t" \
|
||||
"compact " #SRC2".d, " #P2C", " #DST1".d \n\t" \
|
||||
"ext " #SRC1".b, " #SRC1".b, " #DST2".b, #48 \n\t" \
|
||||
"ext " #SRC3".b, " #SRC3".b, " #DST3".b, #48 \n\t" \
|
||||
"compact " #SRC4".d, " #P2C", " #DST4".d \n\t" \
|
||||
"compact " #SRC6".d, " #P2C", " #DST5".d \n\t" \
|
||||
"ext " #SRC5".b, " #SRC5".b, " #DST6".b, #48 \n\t" \
|
||||
"ext " #SRC7".b, " #SRC7".b, " #DST7".b, #48 \n\t" \
|
||||
\
|
||||
"sel " #DST0".d, " #PTFTF", " #DST0".d, " #SRC1".d \n\t" \
|
||||
"sel " #DST2".d, " #PTFTF", " #SRC0".d, " #DST2".d \n\t" \
|
||||
"sel " #DST1".d, " #PTFTF", " #DST1".d, " #SRC3".d \n\t" \
|
||||
"sel " #DST3".d, " #PTFTF", " #SRC2".d, " #DST3".d \n\t" \
|
||||
"sel " #DST4".d, " #PTFTF", " #DST4".d, " #SRC5".d \n\t" \
|
||||
"sel " #DST6".d, " #PTFTF", " #SRC4".d, " #DST6".d \n\t" \
|
||||
"sel " #DST5".d, " #PTFTF", " #DST5".d, " #SRC7".d \n\t" \
|
||||
"sel " #DST7".d, " #PTFTF", " #SRC6".d, " #DST7".d \n\t" \
|
||||
\
|
||||
"compact " #SRC0".d, " #P4C", " #DST0".d \n\t" \
|
||||
"compact " #SRC1".d, " #P4C", " #DST1".d \n\t" \
|
||||
"compact " #SRC2".d, " #P4C", " #DST2".d \n\t" \
|
||||
"compact " #SRC3".d, " #P4C", " #DST3".d \n\t" \
|
||||
"ext " #SRC4".b, " #SRC4".b, " #DST4".b, #32 \n\t" \
|
||||
"ext " #SRC5".b, " #SRC5".b, " #DST5".b, #32 \n\t" \
|
||||
"ext " #SRC6".b, " #SRC6".b, " #DST6".b, #32 \n\t" \
|
||||
"ext " #SRC7".b, " #SRC7".b, " #DST7".b, #32 \n\t" \
|
||||
\
|
||||
"sel " #DST0".d, " #P4", " #DST0".d, " #SRC4".d \n\t" \
|
||||
"sel " #DST1".d, " #P4", " #DST1".d, " #SRC5".d \n\t" \
|
||||
"sel " #DST2".d, " #P4", " #DST2".d, " #SRC6".d \n\t" \
|
||||
"sel " #DST3".d, " #P4", " #DST3".d, " #SRC7".d \n\t" \
|
||||
"sel " #DST4".d, " #P4", " #SRC0".d, " #DST4".d \n\t" \
|
||||
"sel " #DST5".d, " #P4", " #SRC1".d, " #DST5".d \n\t" \
|
||||
"sel " #DST6".d, " #P4", " #SRC2".d, " #DST6".d \n\t" \
|
||||
"sel " #DST7".d, " #P4", " #SRC3".d, " #DST7".d \n\t"
|
||||
|
||||
@@ -52,15 +52,12 @@ void bli_dpackm_armsve256_asm_8xk
|
||||
dim_t cdim_,
|
||||
dim_t n_,
|
||||
dim_t n_max_,
|
||||
void* restrict kappa_,
|
||||
void* restrict a_, inc_t inca_, inc_t lda_,
|
||||
void* restrict p_, inc_t ldp_,
|
||||
double* restrict kappa,
|
||||
double* restrict a, inc_t inca_, inc_t lda_,
|
||||
double* restrict p, inc_t ldp_,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
double* a = ( double* )a_;
|
||||
double* p = ( double* )p_;
|
||||
double* kappa = ( double* )kappa_;
|
||||
const int64_t cdim = cdim_;
|
||||
const int64_t mnr = 8;
|
||||
const int64_t n = n_;
|
||||
|
||||
365
kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
Normal file
365
kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
Normal file
@@ -0,0 +1,365 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "armsve512_asm_transpose_d8x8.h"
|
||||
#include "armsve512_asm_transpose_d8x2.h"
|
||||
|
||||
// assumption:
|
||||
// SVE vector length = 512 bits.
|
||||
|
||||
void bli_dpackm_armsve512_asm_10xk
|
||||
(
|
||||
conj_t conja,
|
||||
pack_t schema,
|
||||
dim_t cdim_,
|
||||
dim_t n_,
|
||||
dim_t n_max_,
|
||||
double* restrict kappa,
|
||||
double* restrict a, inc_t inca_, inc_t lda_,
|
||||
double* restrict p, inc_t ldp_,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const int64_t cdim = cdim_;
|
||||
const int64_t mnr = 10;
|
||||
const int64_t n = n_;
|
||||
const int64_t n_max = n_max_;
|
||||
const int64_t inca = inca_;
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
const bool gs = inca != 1 && lda != 1;
|
||||
const bool unitk = bli_deq1( *kappa );
|
||||
|
||||
#ifdef _A64FX
|
||||
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
|
||||
{
|
||||
// A twisted way to infer whether A or B is being packed.
|
||||
if ( schema == bli_cntx_schema_a_block(cntx) )
|
||||
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
|
||||
if ( schema == bli_cntx_schema_b_panel(cntx) )
|
||||
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
|
||||
}
|
||||
#endif
|
||||
|
||||
if ( cdim == mnr && !gs && unitk )
|
||||
{
|
||||
uint64_t n_mker = n / 8;
|
||||
uint64_t n_left = n % 8;
|
||||
__asm__ volatile (
|
||||
"mov x0, %[a] \n\t"
|
||||
"mov x1, %[p] \n\t"
|
||||
"mov x2, %[ldp] \n\t"
|
||||
"mov x3, %[lda] \n\t"
|
||||
"mov x4, %[inca] \n\t"
|
||||
"cmp x4, #1 \n\t"
|
||||
// Skips by sizeof(double).
|
||||
"mov x8, #8 \n\t"
|
||||
"madd x2, x2, x8, xzr \n\t"
|
||||
"madd x3, x3, x8, xzr \n\t"
|
||||
"madd x4, x4, x8, xzr \n\t"
|
||||
// Loop constants.
|
||||
"mov x8, %[n_mker] \n\t"
|
||||
"mov x9, %[n_left] \n\t"
|
||||
"ptrue p0.d \n\t"
|
||||
"b.ne .AROWSTOR \n\t"
|
||||
// A stored in columns.
|
||||
" .ACOLSTOR: \n\t"
|
||||
// Prefetch distance.
|
||||
"mov x17, #8 \n\t"
|
||||
"madd x17, x17, x3, xzr \n\t"
|
||||
#ifdef _A64FX
|
||||
// Disable hardware prefetch for A.
|
||||
"mov x16, 0x6 \n\t"
|
||||
"lsl x16, x16, #60 \n\t"
|
||||
"orr x0, x0, x16 \n\t"
|
||||
#endif
|
||||
" .ACOLSTORMKER: \n\t"
|
||||
"cmp x8, xzr \n\t"
|
||||
"b.eq .ACOLSTORMKEREND \n\t"
|
||||
"add x5, x0, x3 \n\t"
|
||||
"add x6, x5, x3 \n\t"
|
||||
"add x7, x6, x3 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ldr q1, [x0, #64] \n\t"
|
||||
"ld1d z2.d, p0/z, [x5] \n\t"
|
||||
"ldr q3, [x5, #64] \n\t"
|
||||
"ld1d z4.d, p0/z, [x6] \n\t"
|
||||
"ldr q5, [x6, #64] \n\t"
|
||||
"ld1d z6.d, p0/z, [x7] \n\t"
|
||||
"ldr q7, [x7, #64] \n\t"
|
||||
"add x18, x17, x0 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x5 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x6 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x7 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x0, x7, x3 \n\t"
|
||||
"add x5, x0, x3 \n\t"
|
||||
"add x6, x5, x3 \n\t"
|
||||
"add x7, x6, x3 \n\t"
|
||||
"ld1d z8.d, p0/z, [x0] \n\t"
|
||||
"ldr q9, [x0, #64] \n\t"
|
||||
"ld1d z10.d, p0/z, [x5] \n\t"
|
||||
"ldr q11, [x5, #64] \n\t"
|
||||
"ld1d z12.d, p0/z, [x6] \n\t"
|
||||
"ldr q13, [x6, #64] \n\t"
|
||||
"ld1d z14.d, p0/z, [x7] \n\t"
|
||||
"ldr q15, [x7, #64] \n\t"
|
||||
"add x18, x17, x0 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x5 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x6 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x7 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
// Plain storage
|
||||
"add x10, x1, x2 \n\t"
|
||||
"add x11, x10, x2 \n\t"
|
||||
"add x12, x11, x2 \n\t"
|
||||
"add x13, x12, x2 \n\t"
|
||||
"add x14, x13, x2 \n\t"
|
||||
"add x15, x14, x2 \n\t"
|
||||
"add x16, x15, x2 \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"str q1, [x1, #64] \n\t"
|
||||
"st1d z2.d, p0, [x10] \n\t"
|
||||
"str q3, [x10, #64] \n\t"
|
||||
"st1d z4.d, p0, [x11] \n\t"
|
||||
"str q5, [x11, #64] \n\t"
|
||||
"st1d z6.d, p0, [x12] \n\t"
|
||||
"str q7, [x12, #64] \n\t"
|
||||
"st1d z8.d, p0, [x13] \n\t"
|
||||
"str q9, [x13, #64] \n\t"
|
||||
"st1d z10.d, p0, [x14] \n\t"
|
||||
"str q11, [x14, #64] \n\t"
|
||||
"st1d z12.d, p0, [x15] \n\t"
|
||||
"str q13, [x15, #64] \n\t"
|
||||
"st1d z14.d, p0, [x16] \n\t"
|
||||
"str q15, [x16, #64] \n\t"
|
||||
"add x1, x16, x2 \n\t"
|
||||
// Realign and store.
|
||||
// "ext z1.b, z1.b, z1.b, #16 \n\t"
|
||||
// "ext z1.b, z1.b, z2.b, #48 \n\t"
|
||||
// "ext z2.b, z2.b, z3.b, #16 \n\t"
|
||||
// "ext z2.b, z2.b, z4.b, #32 \n\t"
|
||||
// "ext z4.b, z4.b, z5.b, #16 \n\t"
|
||||
// "ext z4.b, z4.b, z6.b, #16 \n\t"
|
||||
// "ext z6.b, z6.b, z7.b, #16 \n\t"
|
||||
// "ext z9.b, z9.b, z9.b, #16 \n\t"
|
||||
// "ext z9.b, z9.b, z10.b, #48 \n\t"
|
||||
// "ext z10.b, z10.b, z11.b, #16 \n\t"
|
||||
// "ext z10.b, z10.b, z12.b, #32 \n\t"
|
||||
// "ext z12.b, z12.b, z13.b, #16 \n\t"
|
||||
// "ext z12.b, z12.b, z14.b, #16 \n\t"
|
||||
// "ext z14.b, z14.b, z15.b, #16 \n\t"
|
||||
// "st1d z0.d, p0, [x1] \n\t"
|
||||
// "st1d z1.d, p0, [x1, #1, mul vl] \n\t"
|
||||
// "st1d z2.d, p0, [x1, #2, mul vl] \n\t"
|
||||
// "st1d z4.d, p0, [x1, #3, mul vl] \n\t"
|
||||
// "st1d z6.d, p0, [x1, #4, mul vl] \n\t"
|
||||
// "add x1, x1, #320 \n\t"
|
||||
// "st1d z8.d, p0, [x1] \n\t"
|
||||
// "st1d z9.d, p0, [x1, #1, mul vl] \n\t"
|
||||
// "st1d z10.d, p0, [x1, #2, mul vl] \n\t"
|
||||
// "st1d z12.d, p0, [x1, #3, mul vl] \n\t"
|
||||
// "st1d z14.d, p0, [x1, #4, mul vl] \n\t"
|
||||
// "add x1, x1, #320 \n\t"
|
||||
"add x0, x7, x3 \n\t"
|
||||
"sub x8, x8, #1 \n\t"
|
||||
"b .ACOLSTORMKER \n\t"
|
||||
" .ACOLSTORMKEREND: \n\t"
|
||||
" .ACOLSTORLEFT: \n\t"
|
||||
"cmp x9, xzr \n\t"
|
||||
"b.eq .UNITKDONE \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ldr q1, [x0, #64] \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"str q1, [x1, #64] \n\t"
|
||||
"add x0, x0, x3 \n\t"
|
||||
"add x1, x1, x2 \n\t"
|
||||
"sub x9, x9, #1 \n\t"
|
||||
"b .ACOLSTORLEFT \n\t"
|
||||
// A stored in rows.
|
||||
" .AROWSTOR: \n\t"
|
||||
// Prepare predicates for in-reg transpose.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
|
||||
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
|
||||
"cmp x8, xzr \n\t"
|
||||
"b.eq .AROWSTORMKEREND \n\t"
|
||||
"add x10, x0, x4 \n\t"
|
||||
"add x11, x10, x4 \n\t"
|
||||
"add x12, x11, x4 \n\t"
|
||||
"add x13, x12, x4 \n\t"
|
||||
"add x14, x13, x4 \n\t"
|
||||
"add x15, x14, x4 \n\t"
|
||||
"add x16, x15, x4 \n\t"
|
||||
"add x17, x16, x4 \n\t"
|
||||
"add x18, x17, x4 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ld1d z1.d, p0/z, [x10] \n\t"
|
||||
"ld1d z2.d, p0/z, [x11] \n\t"
|
||||
"ld1d z3.d, p0/z, [x12] \n\t"
|
||||
"ld1d z4.d, p0/z, [x13] \n\t"
|
||||
"ld1d z5.d, p0/z, [x14] \n\t"
|
||||
"ld1d z6.d, p0/z, [x15] \n\t"
|
||||
"ld1d z7.d, p0/z, [x16] \n\t"
|
||||
"ld1d z22.d, p0/z, [x17] \n\t"
|
||||
"ld1d z23.d, p0/z, [x18] \n\t"
|
||||
// Transpose first 8 rows.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
|
||||
// Transpose last 2 rows.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x2(z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3)
|
||||
// Plain storage.
|
||||
"add x10, x1, x2 \n\t"
|
||||
"add x11, x10, x2 \n\t"
|
||||
"add x12, x11, x2 \n\t"
|
||||
"add x13, x12, x2 \n\t"
|
||||
"add x14, x13, x2 \n\t"
|
||||
"add x15, x14, x2 \n\t"
|
||||
"add x16, x15, x2 \n\t"
|
||||
"st1d z8.d, p0, [x1] \n\t"
|
||||
"str q16, [x1, #64] \n\t"
|
||||
"st1d z9.d, p0, [x10] \n\t"
|
||||
"str q17, [x10, #64] \n\t"
|
||||
"st1d z10.d, p0, [x11] \n\t"
|
||||
"str q18, [x11, #64] \n\t"
|
||||
"st1d z11.d, p0, [x12] \n\t"
|
||||
"str q19, [x12, #64] \n\t"
|
||||
"st1d z12.d, p0, [x13] \n\t"
|
||||
"str q20, [x13, #64] \n\t"
|
||||
"st1d z13.d, p0, [x14] \n\t"
|
||||
"str q21, [x14, #64] \n\t"
|
||||
"st1d z14.d, p0, [x15] \n\t"
|
||||
"str q22, [x15, #64] \n\t"
|
||||
"st1d z15.d, p0, [x16] \n\t"
|
||||
"str q23, [x16, #64] \n\t"
|
||||
"add x1, x16, x2 \n\t"
|
||||
"add x0, x0, #64 \n\t"
|
||||
"sub x8, x8, #1 \n\t"
|
||||
"b .AROWSTORMKER \n\t"
|
||||
" .AROWSTORMKEREND: \n\t"
|
||||
"mov x4, %[inca] \n\t" // Restore unshifted inca.
|
||||
"index z30.d, xzr, x4 \n\t" // Generate index.
|
||||
"lsl x4, x4, #3 \n\t" // Shift again.
|
||||
"lsl x5, x4, #3 \n\t" // Virtual column vl.
|
||||
" .AROWSTORLEFT: \n\t"
|
||||
"cmp x9, xzr \n\t"
|
||||
"b.eq .UNITKDONE \n\t"
|
||||
"add x6, x0, x5 \n\t"
|
||||
"add x7, x6, x4 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
|
||||
"ldr d1, [x6] \n\t"
|
||||
"ldr d2, [x7] \n\t"
|
||||
"trn1 v1.2d, v1.2d, v2.2d \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"str q1, [x1, #64] \n\t"
|
||||
"add x1, x1, x2 \n\t"
|
||||
"add x0, x0, #8 \n\t"
|
||||
"sub x9, x9, #1 \n\t"
|
||||
"b .AROWSTORLEFT \n\t"
|
||||
" .UNITKDONE: \n\t"
|
||||
"mov x0, #0 \n\t"
|
||||
:
|
||||
: [a] "r" (a),
|
||||
[p] "r" (p),
|
||||
[lda] "r" (lda),
|
||||
[ldp] "r" (ldp),
|
||||
[inca] "r" (inca),
|
||||
[n_mker] "r" (n_mker),
|
||||
[n_left] "r" (n_left)
|
||||
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
|
||||
"x8", "x9", "x10","x11","x12","x13","x14","x15",
|
||||
"x16","x17","x18",
|
||||
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
|
||||
"z8", "z9", "z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19","z20","z21","z22","z23",
|
||||
// "z24","z25","z26","z27","z28","z29",
|
||||
"z30","z31",
|
||||
"p0", "p1", "p2", "p3", "p4", // "p5",
|
||||
"p6", "p7", "p8"
|
||||
);
|
||||
}
|
||||
else // if ( cdim < mnr )
|
||||
{
|
||||
bli_dscal2m_ex
|
||||
(
|
||||
0,
|
||||
BLIS_NONUNIT_DIAG,
|
||||
BLIS_DENSE,
|
||||
( trans_t )conja,
|
||||
cdim,
|
||||
n,
|
||||
kappa,
|
||||
a, inca, lda,
|
||||
p, 1, ldp,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
// if ( cdim < mnr )
|
||||
{
|
||||
const dim_t i = cdim;
|
||||
const dim_t m_edge = mnr - i;
|
||||
const dim_t n_edge = n_max;
|
||||
double* restrict p_edge = p + (i )*1;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if ( n < n_max )
|
||||
{
|
||||
const dim_t j = n;
|
||||
const dim_t m_edge = mnr;
|
||||
const dim_t n_edge = n_max - j;
|
||||
double* restrict p_edge = p + (j )*ldp;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
359
kernels/armsve/1m/bli_dpackm_armsve512_asm_12xk.c
Normal file
359
kernels/armsve/1m/bli_dpackm_armsve512_asm_12xk.c
Normal file
@@ -0,0 +1,359 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Linaro Limited
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#else
|
||||
#error "No Arm SVE intrinsics support in compiler"
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
// assumption:
|
||||
// SVE vector length = 512 bits.
|
||||
// TODO:
|
||||
// 2-rows -> 3 vectors packing and use predicator only in odd num of rows to be packed.
|
||||
// prefetching is needed.
|
||||
|
||||
void bli_dpackm_armsve512_asm_12xk
|
||||
(
|
||||
conj_t conja,
|
||||
pack_t schema,
|
||||
dim_t cdim_,
|
||||
dim_t n_,
|
||||
dim_t n_max_,
|
||||
double* restrict kappa,
|
||||
double* restrict a, inc_t inca_, inc_t lda_,
|
||||
double* restrict p, inc_t ldp_,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const int64_t cdim = cdim_;
|
||||
const int64_t mnr = 12;
|
||||
const int64_t n = n_;
|
||||
const int64_t n_max = n_max_;
|
||||
const int64_t inca = inca_;
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
|
||||
double* restrict alpha1 = a;
|
||||
double* restrict alpha1_8 = alpha1 + 8 * inca;
|
||||
double* restrict alpha1_p4 = alpha1 + 4 * inca;
|
||||
double* restrict alpha1_m4 = alpha1 - 4 * inca;
|
||||
double* restrict pi1 = p;
|
||||
const svbool_t all_active = svptrue_b64();
|
||||
const svbool_t first_half_active = svwhilelt_b64(0, 4);
|
||||
const svbool_t last_half_active = svnot_z(all_active, first_half_active);
|
||||
svfloat64_t z_a0;
|
||||
svfloat64_t z_a8;
|
||||
svfloat64_t z_a8_lh;
|
||||
svfloat64_t z_a16;
|
||||
svuint64_t z_index;
|
||||
|
||||
// creating index for gather/scatter
|
||||
// with each element as: 0, 1*inca, 2*inca, 3*inca
|
||||
z_index = svindex_u64( 0, inca * sizeof( double ) );
|
||||
|
||||
if ( cdim == mnr )
|
||||
{
|
||||
if ( bli_deq1( *kappa ) )
|
||||
{
|
||||
if ( inca == 1 ) // continous memory. packA style
|
||||
{
|
||||
dim_t k = n;
|
||||
// 2 pack into 3 case.
|
||||
if ( ldp == mnr )
|
||||
{
|
||||
for ( ; k > 1; k -= 2 )
|
||||
{
|
||||
// load 12 continuous elments from *a
|
||||
z_a0 = svld1_f64( all_active, alpha1 );
|
||||
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
|
||||
|
||||
// forward address - 0 to 1
|
||||
alpha1 += lda;
|
||||
alpha1_p4 = alpha1 + 4 * inca;
|
||||
alpha1_m4 = alpha1 - 4 * inca;
|
||||
|
||||
// load 12 continuous elments from *a, filling last half of z8.
|
||||
z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
|
||||
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
|
||||
z_a16 = svld1_f64( all_active, alpha1_p4 );
|
||||
|
||||
// stored packed data into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
|
||||
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
|
||||
|
||||
// forward address - 1 to 0
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += 2 * ldp;
|
||||
}
|
||||
}
|
||||
// line-by-line packing case.
|
||||
for ( ; k != 0; --k )
|
||||
{
|
||||
// load 12 continuous elments from *a
|
||||
z_a0 = svld1_f64( all_active, alpha1 );
|
||||
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
|
||||
|
||||
// store them into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
|
||||
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += ldp;
|
||||
}
|
||||
}
|
||||
else // gather/scatter load/store. packB style
|
||||
{
|
||||
dim_t k = n;
|
||||
if ( ldp == mnr )
|
||||
{
|
||||
for ( ; k > 1; k -= 2 )
|
||||
{
|
||||
// gather load from *a
|
||||
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
|
||||
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
|
||||
|
||||
// forward address - 0 to 1
|
||||
alpha1 += lda;
|
||||
alpha1_p4 = alpha1 + 4 * inca;
|
||||
alpha1_m4 = alpha1 - 4 * inca;
|
||||
|
||||
// gather load from *a, filling last half of z8.
|
||||
z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
|
||||
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
|
||||
z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
|
||||
|
||||
// stored packed data into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
|
||||
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
|
||||
|
||||
// forward address - 1 to 0
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += 2 * ldp;
|
||||
}
|
||||
}
|
||||
for ( ; k != 0; --k )
|
||||
{
|
||||
// gather load from *a
|
||||
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
|
||||
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
|
||||
|
||||
// scatter store into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
|
||||
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += ldp;
|
||||
}
|
||||
}
|
||||
}
|
||||
else // *kappa != 1.0
|
||||
{
|
||||
// load kappa into vector
|
||||
svfloat64_t z_kappa;
|
||||
|
||||
z_kappa = svdup_f64( *kappa );
|
||||
|
||||
if ( inca == 1 ) // continous memory. packA style
|
||||
{
|
||||
dim_t k = n;
|
||||
if ( ldp == mnr )
|
||||
{
|
||||
for ( ; k > 1; k -= 2 )
|
||||
{
|
||||
// load 12 continuous elments from *a
|
||||
z_a0 = svld1_f64( all_active, alpha1 );
|
||||
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
|
||||
|
||||
// forward address - 0 to 1
|
||||
alpha1 += lda;
|
||||
alpha1_p4 = alpha1 + 4 * inca;
|
||||
alpha1_m4 = alpha1 - 4 * inca;
|
||||
|
||||
// load 12 continuous elments from *a, filling last half of z8.
|
||||
z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
|
||||
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
|
||||
z_a16 = svld1_f64( all_active, alpha1_p4 );
|
||||
|
||||
// multiply by *kappa
|
||||
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
|
||||
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
|
||||
z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
|
||||
|
||||
// stored packed data into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
|
||||
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
|
||||
|
||||
// forward address - 1 to 0
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += 2 * ldp;
|
||||
}
|
||||
}
|
||||
for ( ; k != 0; --k )
|
||||
{
|
||||
// load 12 continuous elments from *a
|
||||
z_a0 = svld1_f64( all_active, alpha1 );
|
||||
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
|
||||
|
||||
// multiply by *kappa
|
||||
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
|
||||
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
|
||||
|
||||
// store them into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
|
||||
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += ldp;
|
||||
}
|
||||
}
|
||||
else // gather/scatter load/store. packB style
|
||||
{
|
||||
dim_t k = n;
|
||||
if ( ldp == mnr )
|
||||
{
|
||||
for ( ; k > 1; k -= 2 )
|
||||
{
|
||||
// gather load from *a
|
||||
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
|
||||
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
|
||||
|
||||
// forward address - 0 to 1
|
||||
alpha1 += lda;
|
||||
alpha1_p4 = alpha1 + 4 * inca;
|
||||
alpha1_m4 = alpha1 - 4 * inca;
|
||||
|
||||
// gather load from *a, filling last half of z8.
|
||||
z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
|
||||
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
|
||||
z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
|
||||
|
||||
// multiply by *kappa
|
||||
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
|
||||
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
|
||||
z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
|
||||
|
||||
// stored packed data into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
|
||||
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
|
||||
|
||||
// forward address - 1 to 0
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += 2 * ldp;
|
||||
}
|
||||
}
|
||||
for ( ; k != 0; --k )
|
||||
{
|
||||
// gather load from *a
|
||||
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
|
||||
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
|
||||
|
||||
// multiply by *kappa
|
||||
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
|
||||
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
|
||||
|
||||
// scatter store into *p
|
||||
svst1_f64( all_active, pi1, z_a0 );
|
||||
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
|
||||
|
||||
alpha1 += lda;
|
||||
alpha1_8 = alpha1 + 8 * inca;
|
||||
pi1 += ldp;
|
||||
}
|
||||
}
|
||||
} // end of if ( *kappa == 1.0 )
|
||||
}
|
||||
else // if ( cdim < mnr )
|
||||
{
|
||||
bli_dscal2m_ex
|
||||
(
|
||||
0,
|
||||
BLIS_NONUNIT_DIAG,
|
||||
BLIS_DENSE,
|
||||
( trans_t )conja,
|
||||
cdim,
|
||||
n,
|
||||
kappa,
|
||||
a, inca, lda,
|
||||
p, 1, ldp,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
// if ( cdim < mnr )
|
||||
{
|
||||
const dim_t i = cdim;
|
||||
const dim_t m_edge = mnr - i;
|
||||
const dim_t n_edge = n_max;
|
||||
double* restrict p_edge = p + (i )*1;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if ( n < n_max )
|
||||
{
|
||||
const dim_t j = n;
|
||||
const dim_t m_edge = mnr;
|
||||
const dim_t n_edge = n_max - j;
|
||||
double* restrict p_edge = p + (j )*ldp;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
363
kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
Normal file
363
kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
Normal file
@@ -0,0 +1,363 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "armsve512_asm_transpose_d8x8.h"
|
||||
|
||||
// assumption:
|
||||
// SVE vector length = 512 bits.
|
||||
|
||||
void bli_dpackm_armsve512_asm_16xk
|
||||
(
|
||||
conj_t conja,
|
||||
pack_t schema,
|
||||
dim_t cdim_,
|
||||
dim_t n_,
|
||||
dim_t n_max_,
|
||||
double* restrict kappa,
|
||||
double* restrict a, inc_t inca_, inc_t lda_,
|
||||
double* restrict p, inc_t ldp_,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const int64_t cdim = cdim_;
|
||||
const int64_t mnr = 16;
|
||||
const int64_t n = n_;
|
||||
const int64_t n_max = n_max_;
|
||||
const int64_t inca = inca_;
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
const bool gs = inca != 1 && lda != 1;
|
||||
const bool unitk = bli_deq1( *kappa );
|
||||
|
||||
#ifdef _A64FX
|
||||
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
|
||||
{
|
||||
// A twisted way to infer whether A or B is being packed.
|
||||
if ( schema == bli_cntx_schema_a_block(cntx) )
|
||||
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
|
||||
if ( schema == bli_cntx_schema_b_panel(cntx) )
|
||||
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
|
||||
}
|
||||
#endif
|
||||
|
||||
if ( cdim == mnr && !gs && unitk )
|
||||
{
|
||||
uint64_t n_mker = n / 8;
|
||||
uint64_t n_left = n % 8;
|
||||
__asm__ volatile (
|
||||
"mov x0, %[a] \n\t"
|
||||
"mov x1, %[p] \n\t"
|
||||
"mov x2, %[ldp] \n\t"
|
||||
"mov x3, %[lda] \n\t"
|
||||
"mov x4, %[inca] \n\t"
|
||||
"cmp x4, #1 \n\t"
|
||||
// Skips by sizeof(double).
|
||||
"mov x8, #8 \n\t"
|
||||
"madd x2, x2, x8, xzr \n\t"
|
||||
"madd x3, x3, x8, xzr \n\t"
|
||||
"madd x4, x4, x8, xzr \n\t"
|
||||
|
||||
// "mov x8, 0x8 \n\t" // Control#0 for A address.
|
||||
// "mov x8, 0x24 \n\t" // Higher 6bit for Control#0:
|
||||
// "lsl x8, x8, #58 \n\t" // Valid|Strong|Strong|Alloc|Load|Strong
|
||||
// "orr x8, x8, x3 \n\t" // Stride.
|
||||
// "msr S3_3_C11_C6_0, x8 \n\t" // Write system register.
|
||||
|
||||
// Loop constants.
|
||||
"mov x8, %[n_mker] \n\t"
|
||||
"mov x9, %[n_left] \n\t"
|
||||
"ptrue p0.d \n\t"
|
||||
"b.ne .AROWSTOR \n\t"
|
||||
// A stored in columns.
|
||||
" .ACOLSTOR: \n\t"
|
||||
// Prefetch distance.
|
||||
"mov x17, #8 \n\t"
|
||||
"madd x17, x17, x3, xzr \n\t"
|
||||
#ifdef _A64FX
|
||||
"mov x16, 0x6 \n\t" // Disable hardware prefetch for A.
|
||||
"lsl x16, x16, #60 \n\t"
|
||||
"orr x0, x0, x16 \n\t"
|
||||
#endif
|
||||
// "add x5, x0, x3 \n\t"
|
||||
// "add x6, x5, x3 \n\t"
|
||||
// "add x7, x6, x3 \n\t"
|
||||
// "prfm PLDL1STRM, [x0] \n\t"
|
||||
// "prfm PLDL1STRM, [x5] \n\t"
|
||||
// "prfm PLDL1STRM, [x6] \n\t"
|
||||
// "prfm PLDL1STRM, [x7] \n\t"
|
||||
// "add x18, x7, x3 \n\t"
|
||||
// "add x5, x18, x3 \n\t"
|
||||
// "add x6, x5, x3 \n\t"
|
||||
// "add x7, x6, x3 \n\t"
|
||||
// "prfm PLDL1STRM, [x18] \n\t"
|
||||
// "prfm PLDL1STRM, [x5] \n\t"
|
||||
// "prfm PLDL1STRM, [x6] \n\t"
|
||||
// "prfm PLDL1STRM, [x7] \n\t"
|
||||
" .ACOLSTORMKER: \n\t"
|
||||
"cmp x8, xzr \n\t"
|
||||
"b.eq .ACOLSTORMKEREND \n\t"
|
||||
"add x5, x0, x3 \n\t"
|
||||
"add x6, x5, x3 \n\t"
|
||||
"add x7, x6, x3 \n\t"
|
||||
"add x10, x1, x2 \n\t"
|
||||
"add x11, x10, x2 \n\t"
|
||||
"add x12, x11, x2 \n\t"
|
||||
"add x13, x12, x2 \n\t"
|
||||
"add x14, x13, x2 \n\t"
|
||||
"add x15, x14, x2 \n\t"
|
||||
"add x16, x15, x2 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
|
||||
"ld1d z2.d, p0/z, [x5] \n\t"
|
||||
"ld1d z3.d, p0/z, [x5, #1, mul vl] \n\t"
|
||||
"ld1d z4.d, p0/z, [x6] \n\t"
|
||||
"ld1d z5.d, p0/z, [x6, #1, mul vl] \n\t"
|
||||
"ld1d z6.d, p0/z, [x7] \n\t"
|
||||
"ld1d z7.d, p0/z, [x7, #1, mul vl] \n\t"
|
||||
"add x18, x17, x0 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x5 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x6 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x7 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x0, x7, x3 \n\t"
|
||||
"add x5, x0, x3 \n\t"
|
||||
"add x6, x5, x3 \n\t"
|
||||
"add x7, x6, x3 \n\t"
|
||||
"ld1d z8.d, p0/z, [x0] \n\t"
|
||||
"ld1d z9.d, p0/z, [x0, #1, mul vl] \n\t"
|
||||
"ld1d z10.d, p0/z, [x5] \n\t"
|
||||
"ld1d z11.d, p0/z, [x5, #1, mul vl] \n\t"
|
||||
"ld1d z12.d, p0/z, [x6] \n\t"
|
||||
"ld1d z13.d, p0/z, [x6, #1, mul vl] \n\t"
|
||||
"ld1d z14.d, p0/z, [x7] \n\t"
|
||||
"ld1d z15.d, p0/z, [x7, #1, mul vl] \n\t"
|
||||
"add x18, x17, x0 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x5 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x6 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"add x18, x17, x7 \n\t"
|
||||
"prfm PLDL1STRM, [x18] \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
|
||||
"st1d z2.d, p0, [x10] \n\t"
|
||||
"st1d z3.d, p0, [x10, #1, mul vl] \n\t"
|
||||
"st1d z4.d, p0, [x11] \n\t"
|
||||
"st1d z5.d, p0, [x11, #1, mul vl] \n\t"
|
||||
"st1d z6.d, p0, [x12] \n\t"
|
||||
"st1d z7.d, p0, [x12, #1, mul vl] \n\t"
|
||||
"st1d z8.d, p0, [x13] \n\t"
|
||||
"st1d z9.d, p0, [x13, #1, mul vl] \n\t"
|
||||
"st1d z10.d, p0, [x14] \n\t"
|
||||
"st1d z11.d, p0, [x14, #1, mul vl] \n\t"
|
||||
"st1d z12.d, p0, [x15] \n\t"
|
||||
"st1d z13.d, p0, [x15, #1, mul vl] \n\t"
|
||||
"st1d z14.d, p0, [x16] \n\t"
|
||||
"st1d z15.d, p0, [x16, #1, mul vl] \n\t"
|
||||
"add x0, x7, x3 \n\t"
|
||||
"add x1, x16, x2 \n\t"
|
||||
"sub x8, x8, #1 \n\t"
|
||||
"b .ACOLSTORMKER \n\t"
|
||||
" .ACOLSTORMKEREND: \n\t"
|
||||
" .ACOLSTORLEFT: \n\t"
|
||||
"cmp x9, xzr \n\t"
|
||||
"b.eq .UNITKDONE \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
|
||||
"add x0, x0, x3 \n\t"
|
||||
"add x1, x1, x2 \n\t"
|
||||
"sub x9, x9, #1 \n\t"
|
||||
"b .ACOLSTORLEFT \n\t"
|
||||
// A stored in rows.
|
||||
" .AROWSTOR: \n\t"
|
||||
// Prepare predicates for in-reg transpose.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
|
||||
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
|
||||
"cmp x8, xzr \n\t"
|
||||
"b.eq .AROWSTORMKEREND \n\t"
|
||||
"add x10, x0, x4 \n\t"
|
||||
"add x11, x10, x4 \n\t"
|
||||
"add x12, x11, x4 \n\t"
|
||||
"add x13, x12, x4 \n\t"
|
||||
"add x14, x13, x4 \n\t"
|
||||
"add x15, x14, x4 \n\t"
|
||||
"add x16, x15, x4 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0] \n\t"
|
||||
"ld1d z1.d, p0/z, [x10] \n\t"
|
||||
"ld1d z2.d, p0/z, [x11] \n\t"
|
||||
"ld1d z3.d, p0/z, [x12] \n\t"
|
||||
"ld1d z4.d, p0/z, [x13] \n\t"
|
||||
"ld1d z5.d, p0/z, [x14] \n\t"
|
||||
"ld1d z6.d, p0/z, [x15] \n\t"
|
||||
"ld1d z7.d, p0/z, [x16] \n\t"
|
||||
"add x5, x16, x4 \n\t"
|
||||
"add x10, x5, x4 \n\t"
|
||||
"add x11, x10, x4 \n\t"
|
||||
"add x12, x11, x4 \n\t"
|
||||
"add x13, x12, x4 \n\t"
|
||||
"add x14, x13, x4 \n\t"
|
||||
"add x15, x14, x4 \n\t"
|
||||
"add x16, x15, x4 \n\t"
|
||||
"ld1d z16.d, p0/z, [x5] \n\t"
|
||||
"ld1d z17.d, p0/z, [x10] \n\t"
|
||||
"ld1d z18.d, p0/z, [x11] \n\t"
|
||||
"ld1d z19.d, p0/z, [x12] \n\t"
|
||||
"ld1d z20.d, p0/z, [x13] \n\t"
|
||||
"ld1d z21.d, p0/z, [x14] \n\t"
|
||||
"ld1d z22.d, p0/z, [x15] \n\t"
|
||||
"ld1d z23.d, p0/z, [x16] \n\t"
|
||||
// Transpose first 8 rows.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
|
||||
// Transpose last 8 rows.
|
||||
SVE512_IN_REG_TRANSPOSE_d8x8(z24,z25,z26,z27,z28,z29,z30,z31,z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3,p8,p4,p6)
|
||||
"add x10, x1, x2 \n\t"
|
||||
"add x11, x10, x2 \n\t"
|
||||
"add x12, x11, x2 \n\t"
|
||||
"add x13, x12, x2 \n\t"
|
||||
"add x14, x13, x2 \n\t"
|
||||
"add x15, x14, x2 \n\t"
|
||||
"add x16, x15, x2 \n\t"
|
||||
"st1d z8.d, p0, [x1] \n\t"
|
||||
"st1d z24.d, p0, [x1, #1, mul vl] \n\t"
|
||||
"st1d z9.d, p0, [x10] \n\t"
|
||||
"st1d z25.d, p0, [x10, #1, mul vl] \n\t"
|
||||
"st1d z10.d, p0, [x11] \n\t"
|
||||
"st1d z26.d, p0, [x11, #1, mul vl] \n\t"
|
||||
"st1d z11.d, p0, [x12] \n\t"
|
||||
"st1d z27.d, p0, [x12, #1, mul vl] \n\t"
|
||||
"st1d z12.d, p0, [x13] \n\t"
|
||||
"st1d z28.d, p0, [x13, #1, mul vl] \n\t"
|
||||
"st1d z13.d, p0, [x14] \n\t"
|
||||
"st1d z29.d, p0, [x14, #1, mul vl] \n\t"
|
||||
"st1d z14.d, p0, [x15] \n\t"
|
||||
"st1d z30.d, p0, [x15, #1, mul vl] \n\t"
|
||||
"st1d z15.d, p0, [x16] \n\t"
|
||||
"st1d z31.d, p0, [x16, #1, mul vl] \n\t"
|
||||
"add x0, x0, #64 \n\t"
|
||||
"add x1, x16, x2 \n\t"
|
||||
"sub x8, x8, #1 \n\t"
|
||||
"b .AROWSTORMKER \n\t"
|
||||
" .AROWSTORMKEREND: \n\t"
|
||||
"mov x4, %[inca] \n\t" // Restore unshifted inca.
|
||||
"index z30.d, xzr, x4 \n\t" // Generate index.
|
||||
"lsl x4, x4, #3 \n\t" // Shift again.
|
||||
"lsl x5, x4, #3 \n\t" // Virtual column vl.
|
||||
" .AROWSTORLEFT: \n\t"
|
||||
"cmp x9, xzr \n\t"
|
||||
"b.eq .UNITKDONE \n\t"
|
||||
"add x6, x0, x5 \n\t"
|
||||
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
|
||||
"ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t"
|
||||
"st1d z0.d, p0, [x1] \n\t"
|
||||
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
|
||||
"add x1, x1, x2 \n\t"
|
||||
"add x0, x0, #8 \n\t"
|
||||
"sub x9, x9, #1 \n\t"
|
||||
"b .AROWSTORLEFT \n\t"
|
||||
" .UNITKDONE: \n\t"
|
||||
"mov x0, #0 \n\t"
|
||||
:
|
||||
: [a] "r" (a),
|
||||
[p] "r" (p),
|
||||
[lda] "r" (lda),
|
||||
[ldp] "r" (ldp),
|
||||
[inca] "r" (inca),
|
||||
[n_mker] "r" (n_mker),
|
||||
[n_left] "r" (n_left)
|
||||
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
|
||||
"x8", "x9", "x10","x11","x12","x13","x14","x15",
|
||||
"x16","x17","x18",
|
||||
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
|
||||
"z8", "z9", "z10","z11","z12","z13","z14","z15",
|
||||
// "z16","z17","z18","z19","z20","z21","z22","z23",
|
||||
// "z24","z25","z26","z27","z28","z29","z30","z31",
|
||||
"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7"
|
||||
);
|
||||
}
|
||||
else // if ( cdim < mnr )
|
||||
{
|
||||
bli_dscal2m_ex
|
||||
(
|
||||
0,
|
||||
BLIS_NONUNIT_DIAG,
|
||||
BLIS_DENSE,
|
||||
( trans_t )conja,
|
||||
cdim,
|
||||
n,
|
||||
kappa,
|
||||
a, inca, lda,
|
||||
p, 1, ldp,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
// if ( cdim < mnr )
|
||||
{
|
||||
const dim_t i = cdim;
|
||||
const dim_t m_edge = mnr - i;
|
||||
const dim_t n_edge = n_max;
|
||||
double* restrict p_edge = p + (i )*1;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if ( n < n_max )
|
||||
{
|
||||
const dim_t j = n;
|
||||
const dim_t m_edge = mnr;
|
||||
const dim_t n_edge = n_max - j;
|
||||
double* restrict p_edge = p + (j )*ldp;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
191
kernels/armsve/3/armsve_asm_2vx10.h
Normal file
191
kernels/armsve/3/armsve_asm_2vx10.h
Normal file
@@ -0,0 +1,191 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
|
||||
GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BADDR,8) \
|
||||
GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BADDR,9) \
|
||||
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
|
||||
GEMM_FMLA2_LD1R(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BADDR,0) \
|
||||
GEMM_FMLA2_LD1R(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BADDR,1) \
|
||||
GEMM_FMLA2_LD1R(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BADDR,2) \
|
||||
GEMM_FMLA2_LD1R(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BADDR,3) \
|
||||
GEMM_FMLA2_LD1R(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BADDR,4) \
|
||||
GEMM_FMLA2_LD1R(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BADDR,5) \
|
||||
\
|
||||
GEMM_FMLA2_LD1R(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BADDR,6) \
|
||||
GEMM_FMLA2_LD1R(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BADDR,7)
|
||||
|
||||
// Second through forth microkernels are the first one with B vectors rotated.
|
||||
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BRSBIT)
|
||||
|
||||
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BRSBIT)
|
||||
|
||||
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BRSBIT)
|
||||
// NOTE:
|
||||
// The microkernel (PLAIN_1-4 as a whole) satisfies on entry/exit
|
||||
// (sth. akin to loop-invariant):
|
||||
// - BV[0-7] holds B[0:7, 4*k_cur]
|
||||
// - B's address stops at B[0, 4*k_cur+1]
|
||||
|
||||
// Final loop inside K=4 microkernels.
|
||||
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
|
||||
GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BADDR,8) \
|
||||
GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BADDR,9) \
|
||||
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
|
||||
GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
|
||||
GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
|
||||
GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
|
||||
GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
|
||||
GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
|
||||
GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
|
||||
GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
|
||||
GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
|
||||
|
||||
// K=4 MKer loop with B memory scattered.
|
||||
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT) \
|
||||
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
|
||||
" mov "#BELMADDR", "#BADDR" \n\t" \
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BELMADDR,BCSBIT) \
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BELMADDR,BCSBIT) \
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BELMADDR,BCSBIT) \
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BELMADDR,BCSBIT) \
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
|
||||
\
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT)
|
||||
|
||||
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BELMADDR,BRSBIT,BCSBIT)
|
||||
|
||||
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BELMADDR,BRSBIT,BCSBIT)
|
||||
|
||||
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BELMADDR,BRSBIT,BCSBIT)
|
||||
|
||||
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
|
||||
GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
|
||||
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
|
||||
" mov "#BELMADDR", "#BADDR" \n\t" \
|
||||
GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
|
||||
GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
|
||||
GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
|
||||
GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
|
||||
GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
|
||||
GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
|
||||
GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
|
||||
GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
|
||||
|
||||
|
||||
#define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \
|
||||
CLEAR_COL4(Z00,Z01,Z02,Z03) \
|
||||
CLEAR_COL4(Z04,Z05,Z06,Z07) \
|
||||
CLEAR_COL4(Z08,Z09,Z10,Z11) \
|
||||
CLEAR_COL4(Z12,Z13,Z14,Z15) \
|
||||
CLEAR_COL4(Z16,Z17,Z18,Z19)
|
||||
|
||||
#define SCALE_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19,ZFACTOR) \
|
||||
SCALE_COL4(Z00,Z01,Z02,Z03,ZFACTOR) \
|
||||
SCALE_COL4(Z04,Z05,Z06,Z07,ZFACTOR) \
|
||||
SCALE_COL4(Z08,Z09,Z10,Z11,ZFACTOR) \
|
||||
SCALE_COL4(Z12,Z13,Z14,Z15,ZFACTOR) \
|
||||
SCALE_COL4(Z16,Z17,Z18,Z19,ZFACTOR)
|
||||
|
||||
#define GEMM_C_FMAD_UKER(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE) \
|
||||
GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
|
||||
GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
|
||||
GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
|
||||
GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
|
||||
GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE)
|
||||
|
||||
#define GEMM_C_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
|
||||
|
||||
#define GEMM_C_STORE_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
|
||||
|
||||
#define GEMM_C_FMAD_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,CADDR,CCS) \
|
||||
GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
|
||||
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C0FH,C0LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
|
||||
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C1FH,C1LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
|
||||
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C2FH,C2LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
|
||||
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C3FH,C3LH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
|
||||
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C4FH,C4LH,PFH,PLH,CADDR,CCS)
|
||||
|
||||
#define GEMM_C_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_GATHER_LOAD_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_GATHER_LOAD_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_GATHER_LOAD_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_GATHER_LOAD_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_GATHER_LOAD_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
|
||||
|
||||
#define GEMM_C_STORE_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_SCATTER_STORE_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_SCATTER_STORE_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_SCATTER_STORE_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_SCATTER_STORE_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_SCATTER_STORE_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
|
||||
|
||||
#define GEMM_C_FMAD_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,ZIDX,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
|
||||
GEMM_CCOL_GATHER_LOAD_FWD(C0FH,C0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
|
||||
GEMM_CCOL_GATHER_LOAD_FWD(C1FH,C1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
|
||||
GEMM_CCOL_GATHER_LOAD_FWD(C2FH,C2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
|
||||
GEMM_CCOL_GATHER_LOAD_FWD(C3FH,C3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
|
||||
GEMM_CCOL_GATHER_LOAD_FWD(C4FH,C4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
|
||||
|
||||
123
kernels/armsve/3/armsve_asm_macros.h
Normal file
123
kernels/armsve/3/armsve_asm_macros.h
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#define CLEAR_COL2(Z0,Z1) \
|
||||
" dup "#Z0"."DT", #0 \n\t" \
|
||||
" dup "#Z1"."DT", #0 \n\t"
|
||||
|
||||
#define CLEAR_COL4(Z0,Z1,Z2,Z3) \
|
||||
CLEAR_COL2(Z0,Z1) \
|
||||
CLEAR_COL2(Z2,Z3)
|
||||
|
||||
#define SCALE_COL2(Z0,Z1,ZFACTOR) \
|
||||
" fmul "#Z0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \
|
||||
" fmul "#Z1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t" \
|
||||
|
||||
#define SCALE_COL4(Z0,Z1,Z2,Z3,ZFACTOR) \
|
||||
SCALE_COL2(Z0,Z1,ZFACTOR) \
|
||||
SCALE_COL2(Z2,Z3,ZFACTOR)
|
||||
|
||||
// Prefetch or not.
|
||||
#define PREFETCH_CONTIGUOUS_noprfm(LV,PROP,ADDR,SHIFT)
|
||||
#define PREFETCH_CONTIGUOUS_prfm(LV,PROP,ADDR,SHIFT) \
|
||||
" prfm PLD"#LV""#PROP", ["#ADDR", "#SHIFT"] \n\t"
|
||||
|
||||
#define GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
|
||||
" fmla "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" /* A Row 0 :VL */ \
|
||||
" fmla "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t" /* A Row VL:2VL */
|
||||
|
||||
#define GEMM_FMLA2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \
|
||||
GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
|
||||
" "LD1R" "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t"
|
||||
|
||||
#define GEMM_FMLA2_LD1R_G_ELMFWD(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BELMADDR,BCSBIT) \
|
||||
GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
|
||||
" "LD1R" "#BV"."DT", "#PT"/z, ["#BELMADDR"] \n\t" /* Load B */ \
|
||||
" add "#BELMADDR", "#BELMADDR", "#BCSBIT" \n\t" /* Forward B element */
|
||||
|
||||
#define GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
|
||||
" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR"] \n\t" \
|
||||
" "LD1" "#ZLH"."DT", "#PLH"/z, ["#AADDR", #1, mul vl]\n\t"
|
||||
|
||||
#define GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
|
||||
" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
|
||||
" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
|
||||
" "LD1" "#ZLH"."DT", "#PLH"/z, ["#ATEMP", "#ZIDX"."DT", "OFFS"]\n\t"
|
||||
|
||||
// Prefetch or not.
|
||||
#define GEMM_ACOL_GATHER_noprfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
|
||||
#define GEMM_ACOL_GATHER_prfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
|
||||
" "PRFG" PLD"#LV""#PROP", "#PFH", ["#AADDR", "#ZIDX"."DT", "OFFS"] \n\t" \
|
||||
" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
|
||||
" "PRFG" PLD"#LV""#PROP", "#PLH", ["#ATEMP", "#ZIDX"."DT", "OFFS"] \n\t"
|
||||
|
||||
#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(ZFH,ZLH,PFH,PLH,AADDR,A4KS,ACS,ATEMP,PREFMODE) \
|
||||
" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
|
||||
" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
|
||||
PREFETCH_CONTIGUOUS_ ##PREFMODE(L1,STRM,ATEMP,0)
|
||||
|
||||
#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,A4KS,APS,ACS,AVSKIP,ATEMP,PREFMODEL1,PREFMODEL2) \
|
||||
" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
|
||||
GEMM_ACOL_GATHER_ ##PREFMODEL1(L1,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
|
||||
" add "#ATEMP", "#AADDR", "#APS" \n\t" \
|
||||
GEMM_ACOL_GATHER_ ##PREFMODEL2(L2,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
|
||||
" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
|
||||
GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
|
||||
|
||||
#define GEMM_CCOL_CONTIGUOUS_LOAD_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,CADDR) \
|
||||
" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (load) to next column. */
|
||||
|
||||
#define GEMM_CCOL_CONTIGUOUS_STORE_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
|
||||
" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR"] \n\t" \
|
||||
" "ST1" "#ZLH"."DT", "#PLH", ["#CADDR", #1, mul vl] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (store) to next column. */
|
||||
|
||||
#define GEMM_CCOL_FMAD(ZFH,ZLH,PFH,PLH,CFH,CLH,ZSCALE) \
|
||||
" fmad "#ZFH"."DT", "#PFH"/m, "#ZSCALE"."DT", "#CFH"."DT" \n\t" \
|
||||
" fmad "#ZLH"."DT", "#PLH"/m, "#ZSCALE"."DT", "#CLH"."DT" \n\t"
|
||||
|
||||
#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CVSKIP,CTEMP) \
|
||||
" add "#CADDR", "#CADDR", "#CCS" \n\t"
|
||||
|
||||
#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
|
||||
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
|
||||
" "ST1" "#ZLH"."DT", "#PLH", ["#CTEMP", "#ZIDX"."DT", "OFFS"]\n\t" \
|
||||
" add "#CADDR", "#CADDR", "#CCS" \n\t"
|
||||
|
||||
|
||||
46
kernels/armsve/3/armsve_asm_macros_double.h
Normal file
46
kernels/armsve/3/armsve_asm_macros_double.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
// Specify to use double precision.
|
||||
#define DT "d"
|
||||
#define LD1 "ld1d"
|
||||
#define ST1 "st1d"
|
||||
#define LD1R "ld1rd"
|
||||
#define PRFG "prfd"
|
||||
#define SZ "8"
|
||||
#define OFFS "lsl #3"
|
||||
// Include macros.
|
||||
#include "armsve_asm_macros.h"
|
||||
|
||||
46
kernels/armsve/3/armsve_asm_macros_half.h
Normal file
46
kernels/armsve/3/armsve_asm_macros_half.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
// Specify to use half precision.
|
||||
#define DT "h"
|
||||
#define LD1 "ld1h"
|
||||
#define ST1 "st1h"
|
||||
#define LD1R "ld1rh"
|
||||
#define PRFG "prfh"
|
||||
#define SZ "2"
|
||||
// #define OFFS UNSUPPORTED
|
||||
// Include macros.
|
||||
#include "armsve_asm_macros.h"
|
||||
|
||||
46
kernels/armsve/3/armsve_asm_macros_single.h
Normal file
46
kernels/armsve/3/armsve_asm_macros_single.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
// Specify to use single precision.
|
||||
#define DT "s"
|
||||
#define LD1 "ld1w"
|
||||
#define ST1 "st1w"
|
||||
#define LD1R "ld1rw"
|
||||
#define PRFG "prfw"
|
||||
#define SZ "4"
|
||||
#define OFFS "uxtw #2"
|
||||
// Include macros.
|
||||
#include "armsve_asm_macros.h"
|
||||
|
||||
318
kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
Normal file
318
kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
Normal file
@@ -0,0 +1,318 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Forschunszentrum Juelich
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
// Double-precision composite instructions.
|
||||
#include "armsve_asm_macros_double.h"
|
||||
|
||||
// 2vx10 microkernels.
|
||||
#include "armsve_asm_2vx10.h"
|
||||
|
||||
void bli_dgemm_armsve_asm_2vx10_unindexed
|
||||
(
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile (
|
||||
" ldr x0, %[a] \n\t"
|
||||
" ldr x1, %[b] \n\t"
|
||||
" mov x2, xzr \n\t"
|
||||
" incd x2, ALL, MUL #2 \n\t" // Column-skip of A.
|
||||
" mov x3, #10 \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
" ldr x5, %[c] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
#ifdef _A64FX
|
||||
" mov x8, 0x3 \n\t" // Tag C address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x5, x5, x8 \n\t"
|
||||
" mov x8, 0x2 \n\t" // Tag B address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x1, x1, x8 \n\t"
|
||||
" mov x8, 0x1 \n\t" // Tag A address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x0, x0, x8 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
|
||||
" madd x2, x8, x2, xzr \n\t" // cs_a
|
||||
" madd x3, x8, x3, xzr \n\t" // rs_b
|
||||
" madd x7, x8, x7, xzr \n\t" // cs_c
|
||||
" ptrue p0.d \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[k_mker] \n\t" // Number of loops.
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
" LOAD_ABC: \n\t"
|
||||
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
|
||||
" b.eq END_CCOL_PRFM \n\t"
|
||||
|
||||
" ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rd z21.d, p0/z, [x1, 8] \n\t"
|
||||
" ld1rd z22.d, p0/z, [x1, 16] \n\t"
|
||||
" ld1rd z23.d, p0/z, [x1, 24] \n\t"
|
||||
" ld1rd z24.d, p0/z, [x1, 32] \n\t"
|
||||
" ld1rd z25.d, p0/z, [x1, 40] \n\t"
|
||||
" ld1rd z26.d, p0/z, [x1, 48] \n\t"
|
||||
" ld1rd z27.d, p0/z, [x1, 56] \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
" \n\t"
|
||||
" CCOL_PRFM: \n\t"
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
|
||||
" mov x16, x5 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" END_CCOL_PRFM: \n\t"
|
||||
" \n\t"
|
||||
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
|
||||
" \n\t"
|
||||
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
|
||||
" b.eq K_LEFT_LOOP \n\t"
|
||||
" \n\t"
|
||||
" K_MKER_LOOP: \n\t"
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
|
||||
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" b K_MKER_LOOP \n\t"
|
||||
" \n\t"
|
||||
" FIN_MKER_LOOP: \n\t"
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
|
||||
" \n\t"
|
||||
" K_LEFT_LOOP: \n\t"
|
||||
" cmp x8, #0 \n\t" // End of execution.
|
||||
" b.eq WRITE_MEM_PREP \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
" ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rd z21.d, p0/z, [x1, 8] \n\t"
|
||||
" ld1rd z22.d, p0/z, [x1, 16] \n\t"
|
||||
" ld1rd z23.d, p0/z, [x1, 24] \n\t"
|
||||
" ld1rd z24.d, p0/z, [x1, 32] \n\t"
|
||||
" ld1rd z25.d, p0/z, [x1, 40] \n\t"
|
||||
" ld1rd z26.d, p0/z, [x1, 48] \n\t"
|
||||
" ld1rd z27.d, p0/z, [x1, 56] \n\t"
|
||||
" ld1rd z28.d, p0/z, [x1, 64] \n\t"
|
||||
" ld1rd z29.d, p0/z, [x1, 72] \n\t"
|
||||
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
|
||||
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
|
||||
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
|
||||
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
|
||||
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
|
||||
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
|
||||
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
|
||||
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
|
||||
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
|
||||
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
|
||||
" add x0, x0, x2 \n\t" // Forward A.
|
||||
" add x1, x1, x3 \n\t" // Forward B.
|
||||
" sub x8, x8, #1 \n\t"
|
||||
" b K_LEFT_LOOP \n\t" // Next column / row.
|
||||
" \n\t"
|
||||
" WRITE_MEM_PREP: \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ldr x4, [x4] \n\t" // Load alpha & beta (value).
|
||||
" ldr x8, [x8] \n\t"
|
||||
" dup z30.d, x4 \n\t" // Broadcast alpha & beta into vectors.
|
||||
" dup z31.d, x8 \n\t"
|
||||
" fmov d28, #1.0 \n\t" // Prepare FP 1.0.
|
||||
" fmov x16, d28 \n\t"
|
||||
" \n\t"
|
||||
" PREFETCH_ABNEXT: \n\t"
|
||||
" ldr x0, %[a_next] \n\t"
|
||||
" ldr x1, %[b_next] \n\t"
|
||||
#ifdef _A64FX
|
||||
" mov x8, 0x2 \n\t" // Tag B address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x1, x1, x8 \n\t"
|
||||
" mov x8, 0x1 \n\t" // Tag A address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x0, x0, x8 \n\t"
|
||||
#endif
|
||||
" prfm PLDL1STRM, [x0] \n\t"
|
||||
" prfm PLDL1STRM, [x0, 256*1] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*2] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*3] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*4] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*5] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*6] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*7] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*8] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*9] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*10] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*11] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*12] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*13] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*14] \n\t"
|
||||
// " prfm PLDL2KEEP, [x0, 256*15] \n\t"
|
||||
" prfm PLDL1STRM, [x1] \n\t"
|
||||
" prfm PLDL1STRM, [x1, 256*1] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*2] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*3] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*4] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*5] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*6] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*7] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*8] \n\t"
|
||||
// " prfm PLDL2KEEP, [x1, 256*9] \n\t"
|
||||
" \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x6, #1 \n\t" // Preload first half of C for contiguous case.
|
||||
" b.ne WRITE_MEM \n\t"
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
|
||||
" \n\t"
|
||||
" WRITE_MEM: \n\t"
|
||||
" \n\t"
|
||||
" cmp x16, x4 \n\t"
|
||||
" b.eq UNIT_ALPHA \n\t"
|
||||
" \n\t"
|
||||
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
|
||||
" \n\t"
|
||||
" UNIT_ALPHA: \n\t"
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne WRITE_MEM_G \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
// First half of C is already loaded in this case.
|
||||
GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
|
||||
" mov x8, xzr \n\t"
|
||||
" incb x8 \n\t"
|
||||
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" b END_EXEC \n\t"
|
||||
" \n\t"
|
||||
" END_ERROR: \n\t"
|
||||
" mov x0, #1 \n\t" // Return error.
|
||||
" END_EXEC: \n\t"
|
||||
" mov x0, #0 \n\t" // Return normal.
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next)
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
|
||||
"x9","x16",
|
||||
"z0","z1","z2","z3","z4","z5","z6","z7",
|
||||
"z8","z9","z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19",
|
||||
"z20","z21","z22","z23",
|
||||
"z24","z25","z26","z27",
|
||||
"z28","z29","z30","z31"
|
||||
);
|
||||
}
|
||||
|
||||
307
kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
Normal file
307
kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
Normal file
@@ -0,0 +1,307 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
Copyright (C) 2019, Forschunszentrum Juelich
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
// Single-precision composite instructions.
|
||||
#include "armsve_asm_macros_single.h"
|
||||
|
||||
// 2vx10 microkernels.
|
||||
#include "armsve_asm_2vx10.h"
|
||||
|
||||
void bli_sgemm_armsve_asm_2vx10_unindexed
|
||||
(
|
||||
dim_t k0,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile (
|
||||
" ldr x0, %[a] \n\t"
|
||||
" ldr x1, %[b] \n\t"
|
||||
" mov x2, xzr \n\t"
|
||||
" incw x2, ALL, MUL #2 \n\t" // Column-skip of A.
|
||||
" mov x3, #10 \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
" ldr x5, %[c] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
#ifdef _A64FX
|
||||
" mov x8, 0x3 \n\t" // Tag C address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x5, x5, x8 \n\t"
|
||||
" mov x8, 0x2 \n\t" // Tag B address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x1, x1, x8 \n\t"
|
||||
" mov x8, 0x1 \n\t" // Tag A address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x0, x0, x8 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x8, #4 \n\t" // Multiply some address skips by sizeof(float).
|
||||
" madd x2, x8, x2, xzr \n\t" // cs_a
|
||||
" madd x3, x8, x3, xzr \n\t" // rs_b
|
||||
" madd x7, x8, x7, xzr \n\t" // cs_c
|
||||
" ptrue p0.s \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[k_mker] \n\t" // Number of loops.
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
" LOAD_ABC: \n\t"
|
||||
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
|
||||
" b.eq END_CCOL_PRFM \n\t"
|
||||
|
||||
" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rw z21.s, p0/z, [x1, 4] \n\t"
|
||||
" ld1rw z22.s, p0/z, [x1, 8] \n\t"
|
||||
" ld1rw z23.s, p0/z, [x1, 12] \n\t"
|
||||
" ld1rw z24.s, p0/z, [x1, 16] \n\t"
|
||||
" ld1rw z25.s, p0/z, [x1, 20] \n\t"
|
||||
" ld1rw z26.s, p0/z, [x1, 24] \n\t"
|
||||
" ld1rw z27.s, p0/z, [x1, 28] \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
" \n\t"
|
||||
" CCOL_PRFM: \n\t"
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
|
||||
" mov x16, x5 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" END_CCOL_PRFM: \n\t"
|
||||
" \n\t"
|
||||
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
|
||||
" \n\t"
|
||||
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
|
||||
" b.eq K_LEFT_LOOP \n\t"
|
||||
" \n\t"
|
||||
" K_MKER_LOOP: \n\t"
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
|
||||
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" b K_MKER_LOOP \n\t"
|
||||
" \n\t"
|
||||
" FIN_MKER_LOOP: \n\t"
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
|
||||
" \n\t"
|
||||
" K_LEFT_LOOP: \n\t"
|
||||
" cmp x8, #0 \n\t" // End of execution.
|
||||
" b.eq WRITE_MEM_PREP \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rw z21.s, p0/z, [x1, 4] \n\t"
|
||||
" ld1rw z22.s, p0/z, [x1, 8] \n\t"
|
||||
" ld1rw z23.s, p0/z, [x1, 12] \n\t"
|
||||
" ld1rw z24.s, p0/z, [x1, 16] \n\t"
|
||||
" ld1rw z25.s, p0/z, [x1, 20] \n\t"
|
||||
" ld1rw z26.s, p0/z, [x1, 24] \n\t"
|
||||
" ld1rw z27.s, p0/z, [x1, 28] \n\t"
|
||||
" ld1rw z28.s, p0/z, [x1, 32] \n\t"
|
||||
" ld1rw z29.s, p0/z, [x1, 36] \n\t"
|
||||
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
|
||||
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
|
||||
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
|
||||
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
|
||||
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
|
||||
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
|
||||
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
|
||||
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
|
||||
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
|
||||
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
|
||||
" add x0, x0, x2 \n\t" // Forward A.
|
||||
" add x1, x1, x3 \n\t" // Forward B.
|
||||
" sub x8, x8, #1 \n\t"
|
||||
" b K_LEFT_LOOP \n\t" // Next column / row.
|
||||
" \n\t"
|
||||
" WRITE_MEM_PREP: \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ldr w4, [x4] \n\t" // Load alpha & beta (value).
|
||||
" ldr w8, [x8] \n\t"
|
||||
" dup z30.s, w4 \n\t" // Broadcast alpha & beta into vectors.
|
||||
" dup z31.s, w8 \n\t"
|
||||
" \n\t"
|
||||
" PREFETCH_ABNEXT: \n\t"
|
||||
" ldr x0, %[a_next] \n\t"
|
||||
" ldr x1, %[b_next] \n\t"
|
||||
" prfm PLDL2KEEP, [x0] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*9] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*10] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*11] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*12] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*13] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*14] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*15] \n\t"
|
||||
" prfm PLDL2KEEP, [x1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM: \n\t"
|
||||
" \n\t"
|
||||
" fmov s28, #1.0 \n\t"
|
||||
" fmov w16, s28 \n\t"
|
||||
" cmp w16, w4 \n\t"
|
||||
" b.eq UNIT_ALPHA \n\t"
|
||||
" \n\t"
|
||||
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
|
||||
" \n\t"
|
||||
" UNIT_ALPHA: \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne WRITE_MEM_G \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
|
||||
" mov x8, xzr \n\t"
|
||||
" incb x8 \n\t"
|
||||
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" b END_EXEC \n\t"
|
||||
" \n\t"
|
||||
" END_ERROR: \n\t"
|
||||
" mov x0, #1 \n\t" // Return error.
|
||||
" END_EXEC: \n\t"
|
||||
" mov x0, #0 \n\t" // Return normal.
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next)
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
|
||||
"x9","x16",
|
||||
"z0","z1","z2","z3","z4","z5","z6","z7",
|
||||
"z8","z9","z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19",
|
||||
"z20","z21","z22","z23",
|
||||
"z24","z25","z26","z27",
|
||||
"z28","z29","z30","z31"
|
||||
);
|
||||
}
|
||||
|
||||
343
kernels/armsve/3/bli_gemm_armsve_asm_sh2vx10_unindexed.c
Normal file
343
kernels/armsve/3/bli_gemm_armsve_asm_sh2vx10_unindexed.c
Normal file
@@ -0,0 +1,343 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
Copyright (C) 2019, Forschunszentrum Juelich
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
// Half-precision composite instructions.
|
||||
#include "armsve_asm_macros_half.h"
|
||||
|
||||
// 2vx10 microkernels.
|
||||
#include "armsve_asm_2vx10.h"
|
||||
|
||||
// Gather-load / scatter-store instruction for half-precision
|
||||
// needs being defined separately.
|
||||
#undef GEMM_CCOL_GATHER_LOAD_FWD
|
||||
#undef GEMM_CCOL_SCATTER_STORE_FWD
|
||||
|
||||
#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
" add x28, "#CADDR", "#CRS2" \n\t" \
|
||||
" ld1h z31.s, "#PT"/z, ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \
|
||||
" ld1h "#ZFH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \
|
||||
" revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \
|
||||
" fadd "#ZFH".h, "#ZFH".h, z31.h \n\t" \
|
||||
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
|
||||
" add x28, "#CTEMP", "#CRS2" \n\t" \
|
||||
" ld1h z31.s, "#PT"/z, ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \
|
||||
" ld1h "#ZLH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \
|
||||
" revh "#ZLH".s, "#PT"/m, "#ZLH".s \n\t" \
|
||||
" fadd "#ZLH".h, "#ZLH".h, z31.h \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#CCS" \n\t"
|
||||
|
||||
#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \
|
||||
" add x28, "#CADDR", "#CRS2" \n\t" \
|
||||
" st1h "#ZFH".s, "#PT", ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \
|
||||
" revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \
|
||||
" st1h "#ZFH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \
|
||||
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
|
||||
" add x28, "#CTEMP", "#CRS2" \n\t" \
|
||||
" st1h "#ZLH".s, "#PT", ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \
|
||||
" revh "#ZLH".s, "#PT"/m, "#ZLH".s \n\t" \
|
||||
" st1h "#ZLH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#CCS" \n\t"
|
||||
|
||||
|
||||
void bli_shgemm_armsve_asm_2vx10_unindexed
|
||||
(
|
||||
dim_t k0,
|
||||
void* restrict alpha,
|
||||
void* restrict a,
|
||||
void* restrict b,
|
||||
void* restrict beta,
|
||||
void* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile (
|
||||
" ldr x0, %[a] \n\t"
|
||||
" ldr x1, %[b] \n\t"
|
||||
" mov x2, xzr \n\t"
|
||||
" inch x2, ALL, MUL #2 \n\t" // Column-skip of A.
|
||||
" mov x3, #10 \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
" ldr x5, %[c] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
#ifdef _A64FX
|
||||
" mov x8, 0x3 \n\t" // Tag C address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x5, x5, x8 \n\t"
|
||||
" mov x8, 0x2 \n\t" // Tag B address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x1, x1, x8 \n\t"
|
||||
" mov x8, 0x1 \n\t" // Tag A address.
|
||||
" lsl x8, x8, #56 \n\t"
|
||||
" orr x0, x0, x8 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x8, #2 \n\t" // Multiply some address skips by sizeof(float16_t).
|
||||
" madd x2, x8, x2, xzr \n\t" // cs_a
|
||||
" madd x3, x8, x3, xzr \n\t" // rs_b
|
||||
" madd x7, x8, x7, xzr \n\t" // cs_c
|
||||
" ptrue p0.b \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[k_mker] \n\t" // Number of loops.
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
" LOAD_ABC: \n\t"
|
||||
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
|
||||
" b.eq END_CCOL_PRFM \n\t"
|
||||
|
||||
" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rh z21.h, p0/z, [x1, 2] \n\t"
|
||||
" ld1rh z22.h, p0/z, [x1, 4] \n\t"
|
||||
" ld1rh z23.h, p0/z, [x1, 6] \n\t"
|
||||
" ld1rh z24.h, p0/z, [x1, 8] \n\t"
|
||||
" ld1rh z25.h, p0/z, [x1, 10] \n\t"
|
||||
" ld1rh z26.h, p0/z, [x1, 12] \n\t"
|
||||
" ld1rh z27.h, p0/z, [x1, 14] \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
" \n\t"
|
||||
" CCOL_PRFM: \n\t"
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
|
||||
" mov x16, x5 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" END_CCOL_PRFM: \n\t"
|
||||
" \n\t"
|
||||
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
|
||||
" \n\t"
|
||||
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
|
||||
" b.eq K_LEFT_LOOP \n\t"
|
||||
" \n\t"
|
||||
" K_MKER_LOOP: \n\t"
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" \n\t"
|
||||
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
|
||||
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
|
||||
" \n\t"
|
||||
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" b K_MKER_LOOP \n\t"
|
||||
" \n\t"
|
||||
" FIN_MKER_LOOP: \n\t"
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
|
||||
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
|
||||
" \n\t"
|
||||
" K_LEFT_LOOP: \n\t"
|
||||
" cmp x8, #0 \n\t" // End of execution.
|
||||
" b.eq WRITE_MEM_PREP \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
|
||||
" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row.
|
||||
" ld1rh z21.h, p0/z, [x1, 2] \n\t"
|
||||
" ld1rh z22.h, p0/z, [x1, 4] \n\t"
|
||||
" ld1rh z23.h, p0/z, [x1, 6] \n\t"
|
||||
" ld1rh z24.h, p0/z, [x1, 8] \n\t"
|
||||
" ld1rh z25.h, p0/z, [x1, 10] \n\t"
|
||||
" ld1rh z26.h, p0/z, [x1, 12] \n\t"
|
||||
" ld1rh z27.h, p0/z, [x1, 14] \n\t"
|
||||
" ld1rh z28.h, p0/z, [x1, 16] \n\t"
|
||||
" ld1rh z29.h, p0/z, [x1, 18] \n\t"
|
||||
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
|
||||
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
|
||||
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
|
||||
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
|
||||
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
|
||||
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
|
||||
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
|
||||
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
|
||||
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
|
||||
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
|
||||
" add x0, x0, x2 \n\t" // Forward A.
|
||||
" add x1, x1, x3 \n\t" // Forward B.
|
||||
" sub x8, x8, #1 \n\t"
|
||||
" b K_LEFT_LOOP \n\t" // Next column / row.
|
||||
" \n\t"
|
||||
" WRITE_MEM_PREP: \n\t"
|
||||
" \n\t"
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ld1rh z30.h, p0/z, [x4] \n\t" // Load alpha & beta into vectors.
|
||||
" ld1rh z31.h, p0/z, [x8] \n\t"
|
||||
" fmov w4, h28 \n\t" // Copy alpha & beta to GP registers.
|
||||
" fmov w8, h29 \n\t"
|
||||
" \n\t"
|
||||
" PREFETCH_ABNEXT: \n\t"
|
||||
" ldr x0, %[a_next] \n\t"
|
||||
" ldr x1, %[b_next] \n\t"
|
||||
" prfm PLDL2KEEP, [x0] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*9] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*10] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*11] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*12] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*13] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*14] \n\t"
|
||||
" prfm PLDL2KEEP, [x0, 256*15] \n\t"
|
||||
" prfm PLDL2KEEP, [x1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM: \n\t"
|
||||
" \n\t"
|
||||
" fmov h28, #1.0 \n\t"
|
||||
" fmov w16, h28 \n\t"
|
||||
" cmp w16, w4 \n\t"
|
||||
" b.eq UNIT_ALPHA \n\t"
|
||||
" \n\t"
|
||||
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
|
||||
" \n\t"
|
||||
" UNIT_ALPHA: \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne WRITE_MEM_G \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
|
||||
" mov x10, xzr \n\t"
|
||||
" incb x10 \n\t"
|
||||
" madd x10, x10, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" mov x28, #2 \n\t"
|
||||
" madd x6, x28, x6, xzr \n\t" // Double index skip for half-precision case.
|
||||
" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x9,x7,x10,x16)
|
||||
" dup z31.h, w8 \n\t" // Restore beta destroyed by loading.
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x9,x7,x10,x16)
|
||||
" \n\t"
|
||||
" dup z31.h, w8 \n\t" // Restore beta destroyed by loading.
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x5,x7,x10,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x5,x7,x10,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" b END_EXEC \n\t"
|
||||
" \n\t"
|
||||
" END_ERROR: \n\t"
|
||||
" mov x0, #1 \n\t" // Return error.
|
||||
" END_EXEC: \n\t"
|
||||
" mov x0, #0 \n\t" // Return normal.
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next)
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
|
||||
"x9","x16","x10","x28",
|
||||
"z0","z1","z2","z3","z4","z5","z6","z7",
|
||||
"z8","z9","z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19",
|
||||
"z20","z21","z22","z23",
|
||||
"z24","z25","z26","z27",
|
||||
"z28","z29","z30","z31"
|
||||
);
|
||||
}
|
||||
|
||||
450
kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c
Normal file
450
kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c
Normal file
@@ -0,0 +1,450 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Separate instantiation for ArmSVE reference kernels.
|
||||
// Temporary workaround. Will be removed after upstream has switched to a better way
|
||||
// of exposing gemmsup interface.
|
||||
|
||||
//
|
||||
// -- Row storage case ---------------------------------------------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
/* NOTE: This microkernel can actually handle arbitrarily large
|
||||
values of m, n, and k. */ \
|
||||
\
|
||||
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
|
||||
{ \
|
||||
/* Traverse c by rows. */ \
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict ci = &c[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a[ i*rs_a ]; \
|
||||
\
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cij = &ci[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b [ j*cs_b ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
|
||||
{ \
|
||||
/* Traverse c by rows. */ \
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict ci = &c[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a[ i*rs_a ]; \
|
||||
\
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cij = &ci[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b [ j*cs_b ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
|
||||
{ \
|
||||
/* Traverse c by rows. */ \
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict ci = &c[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a[ i*rs_a ]; \
|
||||
\
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cij = &ci[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b [ j*cs_b ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
|
||||
{ \
|
||||
/* Traverse c by rows. */ \
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict ci = &c[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a[ i*rs_a ]; \
|
||||
\
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cij = &ci[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b [ j*cs_b ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
|
||||
PASTEMAC(ch,conjs)( ab ); \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( gemmsup_r, _armsve, _ref2 )
|
||||
|
||||
//
|
||||
// -- Column storage case ------------------------------------------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
/* NOTE: This microkernel can actually handle arbitrarily large
|
||||
values of m, n, and k. */ \
|
||||
\
|
||||
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
|
||||
{ \
|
||||
/* Traverse c by columns. */ \
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cj = &c[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b[ j*cs_b ]; \
|
||||
\
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict cij = &cj[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a [ i*rs_a ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
|
||||
{ \
|
||||
/* Traverse c by columns. */ \
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cj = &c[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b[ j*cs_b ]; \
|
||||
\
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict cij = &cj[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a [ i*rs_a ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
|
||||
{ \
|
||||
/* Traverse c by columns. */ \
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cj = &c[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b[ j*cs_b ]; \
|
||||
\
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict cij = &cj[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a [ i*rs_a ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
|
||||
{ \
|
||||
/* Traverse c by columns. */ \
|
||||
for ( dim_t j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict cj = &c[ j*cs_c ]; \
|
||||
ctype* restrict bj = &b[ j*cs_b ]; \
|
||||
\
|
||||
for ( dim_t i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype* restrict cij = &cj[ i*rs_c ]; \
|
||||
ctype* restrict ai = &a [ i*rs_a ]; \
|
||||
ctype ab; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( ab ); \
|
||||
\
|
||||
/* Perform a dot product to update the (i,j) element of c. */ \
|
||||
for ( dim_t l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict aij = &ai[ l*cs_a ]; \
|
||||
ctype* restrict bij = &bj[ l*rs_b ]; \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
|
||||
} \
|
||||
\
|
||||
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
|
||||
PASTEMAC(ch,conjs)( ab ); \
|
||||
\
|
||||
/* If beta is one, add ab into c. If beta is zero, overwrite c
|
||||
with the result in ab. Otherwise, scale by beta and accumulate
|
||||
ab to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( gemmsup_c, _armsve, _ref2 )
|
||||
|
||||
@@ -0,0 +1,528 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
#include <assert.h>
|
||||
|
||||
// Double-precision composite instructions.
|
||||
#include "../armsve_asm_macros_double.h"
|
||||
|
||||
// 2vx10 microkernels.
|
||||
#include "../armsve_asm_2vx10.h"
|
||||
|
||||
// Prototype reference kernel.
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_c_armsve_ref2 )
|
||||
|
||||
void __attribute__ ((noinline,optimize(0))) bli_dgemmsup_cv_armsve_2vx10_unindexed
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
static int called = 0;
|
||||
if ( !called )
|
||||
{
|
||||
fprintf(stderr, "rv called.\n");
|
||||
called = 1;
|
||||
}
|
||||
// c*c requires A to be stored in columns.
|
||||
assert( rs_a0 == 1 );
|
||||
|
||||
dim_t n0_mker = n0 / 10;
|
||||
dim_t n0_left = n0 % 10;
|
||||
|
||||
if ( n0_left )
|
||||
{
|
||||
// A[:, ::]
|
||||
// B[::, n0_mker*10:n0]
|
||||
// C[: , n0_mker*10:n0]
|
||||
double *ai = a;
|
||||
double *bi = b + n0_mker * 10 * cs_b0;
|
||||
double *ci = c + n0_mker * 10 * cs_c0;
|
||||
bli_dgemmsup_c_armsve_ref2
|
||||
(
|
||||
conja, conjb,
|
||||
m0, n0_left, k0,
|
||||
alpha,
|
||||
ai, rs_a0, cs_a0,
|
||||
bi, rs_b0, cs_b0,
|
||||
beta,
|
||||
ci, rs_c0, cs_c0,
|
||||
data,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
// Return if it's a pure edge case.
|
||||
if ( !n0_mker )
|
||||
return;
|
||||
|
||||
// Determine VL.
|
||||
uint64_t vlen2;
|
||||
__asm__ (
|
||||
" mov x0, xzr \n\t"
|
||||
" incd x0, ALL, MUL #2 \n\t"
|
||||
" mov %[vlen2], x0 \n\t"
|
||||
: [vlen2] "=r" (vlen2)
|
||||
:
|
||||
: "x0"
|
||||
);
|
||||
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
// uint64_t rs_a = 1;
|
||||
uint64_t cs_a = cs_a0;
|
||||
uint64_t rs_b = rs_b0;
|
||||
uint64_t cs_b = cs_b0;
|
||||
|
||||
uint64_t k_mker = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
uint64_t n_mker = n0_mker;
|
||||
|
||||
dim_t m0_mker = m0 / vlen2;
|
||||
dim_t m0_left = m0 % vlen2;
|
||||
if ( m0_left )
|
||||
{
|
||||
// Edge case on A side can be handled with one more (predicated) loop.
|
||||
m0_mker++;
|
||||
} else
|
||||
m0_left = vlen2;
|
||||
// uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||
|
||||
for ( dim_t im0_mker = 0; im0_mker < m0_mker; ++im0_mker )
|
||||
{
|
||||
uint64_t m_curr = vlen2;
|
||||
if ( im0_mker == m0_mker - 1 )
|
||||
{
|
||||
// Last m-loop. Maybe unnecessary.
|
||||
m_curr = m0_left;
|
||||
}
|
||||
double *ai = a + im0_mker * vlen2 * rs_a0;
|
||||
double *bi = b;
|
||||
double *ci = c + im0_mker * vlen2 * rs_c0;
|
||||
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
__asm__ volatile (
|
||||
" ldr x0, %[bi] \n\t"
|
||||
" ldr x1, %[rs_b] \n\t" // Row-skip of B.
|
||||
" ldr x2, %[cs_b] \n\t" // Column-skip of B (element skip of B[l, :]).
|
||||
" ldr x3, %[ps_b] \n\t" // Panel-skip (10*k) of B.
|
||||
" ldr x4, %[cs_a] \n\t" // Column-Skip of A.
|
||||
" \n\t" // Element skip of A[:, l] is guaranteed to be 1.
|
||||
" ldr x5, %[ci] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x1 \n\t" // Tag C address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x5, x5, x16 \n\t"
|
||||
" mov x16, 0x2 \n\t" // Tag B address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x0, x0, x16 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
|
||||
" madd x1, x8, x1, xzr \n\t" // rs_b
|
||||
" madd x2, x8, x2, xzr \n\t" // cs_b
|
||||
" madd x3, x8, x3, xzr \n\t" // ps_b
|
||||
" madd x4, x8, x4, xzr \n\t" // cs_a
|
||||
" madd x7, x8, x7, xzr \n\t" // cs_c
|
||||
" mov x8, #4 \n\t"
|
||||
" madd x15, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for A.
|
||||
" \n\t"
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x20 \n\t" // Higher 6bit for Control#2:
|
||||
" lsl x16, x16, #58 \n\t" // Valid|Strong|Strong|NoAlloc|Load|Strong
|
||||
" orr x16, x16, x4 \n\t" // Stride.
|
||||
" msr S3_3_C11_C6_2, x16 \n\t" // Write system register.
|
||||
#endif
|
||||
" \n\t"
|
||||
" ldr x8, %[m_curr] \n\t" // Size of first dimension.
|
||||
" mov x9, xzr \n\t"
|
||||
" incd x9 \n\t"
|
||||
" ptrue p0.d \n\t"
|
||||
" whilelo p1.d, xzr, x8 \n\t"
|
||||
" whilelo p2.d, x9, x8 \n\t"
|
||||
" \n\t"
|
||||
" ldr x8, %[n_mker] \n\t" // Number of N-loops.
|
||||
" \n\t"
|
||||
" ldr x20, %[ai] \n\t" // Parameters to be reloaded
|
||||
" ldr x21, %[k_mker] \n\t" // within each millikernel loop.
|
||||
" ldr x22, %[k_left] \n\t"
|
||||
" ldr x23, %[alpha] \n\t"
|
||||
" ldr x24, %[beta] \n\t"
|
||||
" ldr x25, %[a_next] \n\t"
|
||||
" ldr x26, %[b_next] \n\t"
|
||||
" ldr x23, [x23] \n\t" // Directly load alpha and beta.
|
||||
" ldr x24, [x24] \n\t"
|
||||
" \n\t"
|
||||
" MILLIKER_MLOOP: \n\t"
|
||||
" \n\t"
|
||||
" mov x11, x0 \n\t" // B's address.
|
||||
// " ldr x10, %[ai] \n\t" // A's address.
|
||||
" mov x10, x20 \n\t"
|
||||
// " ldr x12, %[k_mker] \n\t"
|
||||
" mov x12, x21 \n\t"
|
||||
// " ldr x13, %[k_left] \n\t"
|
||||
" mov x13, x22 \n\t"
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x3 \n\t" // Tag A address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x10, x10, x16 \n\t"
|
||||
" mov x16, 0xa \n\t" // Control#2 for A address.
|
||||
" lsl x16, x16, #60 \n\t"
|
||||
" orr x10, x10, x16 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" cmp x12, #0 \n\t" // Don't preload if no microkernel there.
|
||||
" b.eq END_CCOL_PRFM \n\t"
|
||||
" \n\t"
|
||||
" mov x14, x11 \n\t"
|
||||
" ld1rd z20.d, p0/z, [x14] \n\t" // Load 8/10 of first B row.
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z21.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z22.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z23.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z24.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z25.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z26.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z27.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x14] \n\t" // And prefetch the 2/10 left.
|
||||
" add x14, x14, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x14] \n\t"
|
||||
" sub x14, x14, x2 \n\t" // Restore x14 to load edge.
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p1,p2,x10)
|
||||
" add x16, x10, x4 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t" // Prefetch 3/4 of A.
|
||||
" add x16, x10, x4 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x10, x4 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" \n\t"
|
||||
" CCOL_PRFM: \n\t"
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
|
||||
" mov x16, x5 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" add x16, x16, x7 \n\t"
|
||||
" prfm PLDL1STRM, [x16] \n\t"
|
||||
" END_CCOL_PRFM: \n\t"
|
||||
" \n\t"
|
||||
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
|
||||
" \n\t"
|
||||
" cmp x12, #0 \n\t" // If no 4-microkernel can be applied
|
||||
" b.eq K_LEFT_LOOP \n\t"
|
||||
" \n\t"
|
||||
" K_MKER_LOOP: \n\t"
|
||||
" \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
|
||||
" \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
|
||||
" \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
|
||||
" \n\t"
|
||||
" subs x12, x12, #1 \n\t" // Decrease counter before final replica.
|
||||
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
|
||||
" \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
|
||||
" b K_MKER_LOOP \n\t"
|
||||
" \n\t"
|
||||
" FIN_MKER_LOOP: \n\t"
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
|
||||
" add x10, x10, x4 \n\t" // Forward A to fill the blank.
|
||||
" \n\t"
|
||||
" K_LEFT_LOOP: \n\t"
|
||||
" cmp x13, #0 \n\t" // End of execution.
|
||||
" b.eq WRITE_MEM_PREP \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p1,p2,x10)
|
||||
" mov x14, x11 \n\t"
|
||||
" ld1rd z20.d, p0/z, [x14] \n\t" // Load 10/10 B.
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z21.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z22.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z23.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z24.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z25.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z26.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z27.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z28.d, p0/z, [x14] \n\t"
|
||||
" add x14, x14, x2 \n\t"
|
||||
" ld1rd z29.d, p0/z, [x14] \n\t"
|
||||
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
|
||||
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
|
||||
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
|
||||
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
|
||||
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
|
||||
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
|
||||
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
|
||||
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
|
||||
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
|
||||
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
|
||||
" add x10, x10, x4 \n\t" // Forward A.
|
||||
" add x11, x11, x1 \n\t" // Forward B.
|
||||
" sub x13, x13, #1 \n\t"
|
||||
" b K_LEFT_LOOP \n\t" // Next column / row.
|
||||
" \n\t"
|
||||
" WRITE_MEM_PREP: \n\t"
|
||||
" \n\t"
|
||||
// " ldr x10, %[ai] \n\t"
|
||||
" mov x10, x20 \n\t"
|
||||
" add x11, x0, x3 \n\t"
|
||||
" dup z30.d, x23 \n\t" // Broadcast alpha & beta into vectors.
|
||||
" dup z31.d, x24 \n\t"
|
||||
" \n\t"
|
||||
" cmp x8, #1 \n\t"
|
||||
" b.eq PREFETCH_ABNEXT \n\t"
|
||||
" prfm PLDL1STRM, [x10] \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" add x11, x11, x2 \n\t"
|
||||
" prfm PLDL1KEEP, [x11] \n\t"
|
||||
" b WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" PREFETCH_ABNEXT: \n\t"
|
||||
// " ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed.
|
||||
" mov x1, x25 \n\t"
|
||||
// " ldr x2, %[b_next] \n\t"
|
||||
" mov x2, x26 \n\t"
|
||||
" prfm PLDL2KEEP, [x1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*10] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*11] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*12] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*13] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*14] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*15] \n\t"
|
||||
" prfm PLDL2KEEP, [x2] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*9] \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM: \n\t"
|
||||
" \n\t"
|
||||
" fmov d28, #1.0 \n\t"
|
||||
" fmov x16, d28 \n\t"
|
||||
" cmp x16, x23 \n\t"
|
||||
" b.eq UNIT_ALPHA \n\t"
|
||||
" \n\t"
|
||||
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
|
||||
" \n\t"
|
||||
" UNIT_ALPHA: \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne WRITE_MEM_G \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
" mov x13, xzr \n\t" // C-column's physical 1-vector skip.
|
||||
" incb x13 \n\t"
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x5,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x5,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
|
||||
" mov x12, xzr \n\t"
|
||||
" incb x12 \n\t"
|
||||
" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x5,x7,x13,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x5,x7,x13,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" subs x8, x8, #1 \n\t"
|
||||
" b.eq END_EXEC \n\t"
|
||||
" \n\t" // Address of C already forwarded to next column.
|
||||
" add x0, x0, x3 \n\t" // Forward B's base address to the next logic panel.
|
||||
" b MILLIKER_MLOOP \n\t"
|
||||
" \n\t"
|
||||
" END_ERROR: \n\t"
|
||||
" mov x0, #1 \n\t" // Return error.
|
||||
" END_EXEC: \n\t"
|
||||
" mov x0, #0 \n\t" // Return normal.
|
||||
:
|
||||
: [bi] "m" (bi),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
[ps_b] "m" (ps_b),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ci] "m" (ci),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[m_curr] "m" (m_curr),
|
||||
[n_mker] "m" (n_mker),
|
||||
[ai] "m" (ai),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next)
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
|
||||
"x9","x10","x11","x12","x13","x14","x15","x16","x17",
|
||||
"x20","x21","x22","x23","x24","x25","x26",
|
||||
"z0","z1","z2","z3","z4","z5","z6","z7",
|
||||
"z8","z9","z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19",
|
||||
"z20","z21","z22","z23",
|
||||
"z24","z25","z26","z27",
|
||||
"z28","z29","z30","z31"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void bli_dgemmsup_rv_armsve_10x2v_unindexed
|
||||
(
|
||||
conj_t conjat,
|
||||
conj_t conjbt,
|
||||
dim_t m0t,
|
||||
dim_t n0t,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict at, inc_t rs_at0, inc_t cs_at0,
|
||||
double* restrict bt, inc_t rs_bt0, inc_t cs_bt0,
|
||||
double* restrict beta,
|
||||
double* restrict ct, inc_t rs_ct0, inc_t cs_ct0,
|
||||
auxinfo_t* restrict datat,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
auxinfo_t data;
|
||||
bli_auxinfo_set_next_a( bli_auxinfo_next_b( datat ), &data );
|
||||
bli_auxinfo_set_next_b( bli_auxinfo_next_a( datat ), &data );
|
||||
bli_auxinfo_set_ps_a( bli_auxinfo_ps_b( datat ), &data );
|
||||
bli_auxinfo_set_ps_b( bli_auxinfo_ps_a( datat ), &data );
|
||||
bli_dgemmsup_cv_armsve_2vx10_unindexed
|
||||
(
|
||||
conjbt, conjat,
|
||||
n0t, m0t, k0,
|
||||
alpha,
|
||||
bt, cs_bt0, rs_bt0,
|
||||
at, cs_at0, rs_at0,
|
||||
beta,
|
||||
ct, cs_ct0, rs_ct0,
|
||||
&data,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,412 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
#include <assert.h>
|
||||
|
||||
// Double-precision composite instructions.
|
||||
#include "../armsve_asm_macros_double.h"
|
||||
|
||||
// 2vx10 microkernels.
|
||||
#include "../armsve_asm_2vx10.h"
|
||||
|
||||
// Prototype reference kernel.
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_r_armsve_ref2 )
|
||||
|
||||
void __attribute__ ((optimize(0))) bli_dgemmsup_rv_armsve_2vx10_unindexed
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
static int called = 0;
|
||||
if ( !called )
|
||||
{
|
||||
fprintf(stderr, "rv called.\n");
|
||||
called = 1;
|
||||
}
|
||||
// r*r requires B to be stored in rows.
|
||||
assert(cs_b0 == 1);
|
||||
|
||||
dim_t n0_mker = n0 / 10;
|
||||
dim_t n0_left = n0 % 10;
|
||||
|
||||
if ( n0_left )
|
||||
{
|
||||
// A[:, ::]
|
||||
// B[::, n0_mker*10:n0]
|
||||
// C[: , n0_mker*10:n0]
|
||||
double *ai = a;
|
||||
double *bi = b + n0_mker * 10 * cs_b0;
|
||||
double *ci = c + n0_mker * 10 * cs_c0;
|
||||
bli_dgemmsup_r_armsve_ref2
|
||||
(
|
||||
conja, conjb,
|
||||
m0, n0_left, k0,
|
||||
alpha,
|
||||
ai, rs_a0, cs_a0,
|
||||
bi, rs_b0, cs_b0,
|
||||
beta,
|
||||
ci, rs_c0, cs_c0,
|
||||
data,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
// Return if it's a pure edge case.
|
||||
if ( !n0_mker )
|
||||
return;
|
||||
|
||||
// Determine VL.
|
||||
uint64_t vlen2;
|
||||
__asm__ (
|
||||
" mov x0, xzr \n\t"
|
||||
" incd x0, ALL, MUL #2 \n\t"
|
||||
" mov %[vlen2], x0 \n\t"
|
||||
: [vlen2] "=r" (vlen2)
|
||||
:
|
||||
: "x0"
|
||||
);
|
||||
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
uint64_t rs_a = rs_a0;
|
||||
uint64_t cs_a = cs_a0;
|
||||
uint64_t rs_b = rs_b0;
|
||||
// uint64_t cs_b = 1;
|
||||
|
||||
uint64_t k_mker = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
uint64_t m_mker = m0 / vlen2;
|
||||
uint64_t m_left = m0 % vlen2;
|
||||
if ( m_left )
|
||||
{
|
||||
// Edge case on A side can be handled with one more (predicated) loop.
|
||||
m_mker++;
|
||||
} else
|
||||
m_left = vlen2;
|
||||
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||
// uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||
|
||||
for ( dim_t in0_mker = 0; in0_mker < n0_mker; ++in0_mker )
|
||||
{
|
||||
double *ai = a;
|
||||
double *bi = b + in0_mker * 10 * cs_b0;
|
||||
double *ci = c + in0_mker * 10 * cs_c0;
|
||||
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
__asm__ volatile (
|
||||
" ldr x0, %[ai] \n\t"
|
||||
" ldr x1, %[rs_a] \n\t" // Row-skip of A (element skip of A[:, l]).
|
||||
" ldr x2, %[cs_a] \n\t" // Column-skip of A.
|
||||
" ldr x3, %[ps_a] \n\t" // Panel-skip (vlen2*k) of A.
|
||||
" ldr x4, %[rs_b] \n\t" // Row-Skip of B.
|
||||
" \n\t" // Element skip of B[l, :] is guaranteed to be 1.
|
||||
" ldr x5, %[ci] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x1 \n\t" // Tag C address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x5, x5, x16 \n\t"
|
||||
" mov x16, 0x2 \n\t" // Tag A address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x0, x0, x16 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
|
||||
" madd x2, x8, x2, xzr \n\t" // cs_a
|
||||
" madd x3, x8, x3, xzr \n\t" // ps_a
|
||||
" madd x4, x8, x4, xzr \n\t" // rs_b
|
||||
" madd x7, x8, x7, xzr \n\t" // cs_c
|
||||
" mov x8, xzr \n\t"
|
||||
" incb x8 \n\t"
|
||||
" madd x14, x8, x1, xzr \n\t" // A-column's logical 1-vector skip.
|
||||
" mov x8, #4 \n\t"
|
||||
" madd x15, x8, x2, xzr \n\t" // Logical K=4 microkernel skip for A.
|
||||
// " mov x8, #4 \n\t"
|
||||
// " madd x17, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for B.
|
||||
" \n\t"
|
||||
" ldr x8, %[m_mker] \n\t" // Number of M-loops.
|
||||
" ptrue p0.d \n\t"
|
||||
" ptrue p1.d \n\t"
|
||||
" ptrue p2.d \n\t"
|
||||
" \n\t"
|
||||
" MILLIKER_MLOOP: \n\t"
|
||||
" \n\t"
|
||||
" cmp x8, #1 \n\t"
|
||||
" b.ne UKER_BEGIN \n\t"
|
||||
" \n\t"
|
||||
" ldr x10, %[m_left] \n\t" // Final (incomplete) millikernel loop.
|
||||
" mov x11, xzr \n\t"
|
||||
" incd x11 \n\t"
|
||||
" whilelo p1.d, xzr, x10 \n\t" // Overwrite p1/p2.
|
||||
" whilelo p2.d, x11, x10 \n\t"
|
||||
" \n\t"
|
||||
" UKER_BEGIN: \n\t"
|
||||
" mov x10, x0 \n\t" // A's address.
|
||||
" ldr x11, %[bi] \n\t" // B's address.
|
||||
" ldr x12, %[k_mker] \n\t"
|
||||
" ldr x13, %[k_left] \n\t"
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x3 \n\t" // Tag B address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr x11, x11, x16 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x16, x11 \n\t" // Prefetch first kernel of B.
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x4 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x4 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, x4 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" \n\t"
|
||||
" ld1rd z20.d, p0/z, [x11] \n\t" // (Partial) first B row.
|
||||
" ld1rd z21.d, p0/z, [x11, #8] \n\t"
|
||||
" ld1rd z22.d, p0/z, [x11, #16] \n\t"
|
||||
" ld1rd z23.d, p0/z, [x11, #24] \n\t"
|
||||
" ld1rd z24.d, p0/z, [x11, #32] \n\t"
|
||||
" ld1rd z25.d, p0/z, [x11, #40] \n\t"
|
||||
" ld1rd z26.d, p0/z, [x11, #48] \n\t"
|
||||
" ld1rd z27.d, p0/z, [x11, #56] \n\t"
|
||||
" \n\t"
|
||||
" index z29.d, xzr, x1 \n\t" // First A column.
|
||||
" \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_ACOL_GATHER_LOAD(z28,z29,z29,p1,p2,x10,x14,x16)
|
||||
" \n\t"
|
||||
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
|
||||
" \n\t"
|
||||
" cmp x12, #0 \n\t" // If no 4-microkernel can be applied
|
||||
" b.eq K_LEFT_LOOP \n\t"
|
||||
" \n\t"
|
||||
" K_MKER_LOOP: \n\t" // Unroll the 4-loop.
|
||||
" \n\t"
|
||||
" index z31.d, xzr, x1 \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
|
||||
" \n\t"
|
||||
" index z29.d, xzr, x1 \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
|
||||
" \n\t"
|
||||
" index z31.d, xzr, x1 \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
|
||||
" \n\t"
|
||||
" subs x12, x12, #1 \n\t" // Decrease counter before final replica.
|
||||
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
|
||||
" \n\t"
|
||||
" index z29.d, xzr, x1 \n\t"
|
||||
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
|
||||
" b K_MKER_LOOP \n\t"
|
||||
" \n\t"
|
||||
" FIN_MKER_LOOP: \n\t"
|
||||
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
|
||||
" add x10, x10, x2 \n\t" // Forward A to fill the blank.
|
||||
" \n\t"
|
||||
" K_LEFT_LOOP: \n\t"
|
||||
" cmp x13, #0 \n\t"
|
||||
" b.eq WRITE_MEM_PREP \n\t"
|
||||
" \n\t"
|
||||
" index z31.d, xzr, x1 \n\t"
|
||||
GEMM_ACOL_GATHER_LOAD(z30,z31,z31,p1,p2,x10,x14,x16)
|
||||
" ld1rd z20.d, p0/z, [x11] \n\t"
|
||||
" ld1rd z21.d, p0/z, [x11, #8] \n\t"
|
||||
" ld1rd z22.d, p0/z, [x11, #16] \n\t"
|
||||
" ld1rd z23.d, p0/z, [x11, #24] \n\t"
|
||||
" ld1rd z24.d, p0/z, [x11, #32] \n\t"
|
||||
" ld1rd z25.d, p0/z, [x11, #40] \n\t"
|
||||
" ld1rd z26.d, p0/z, [x11, #48] \n\t"
|
||||
" ld1rd z27.d, p0/z, [x11, #56] \n\t"
|
||||
" ld1rd z28.d, p0/z, [x11, #64] \n\t"
|
||||
" ld1rd z29.d, p0/z, [x11, #72] \n\t"
|
||||
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
|
||||
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
|
||||
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
|
||||
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
|
||||
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
|
||||
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
|
||||
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
|
||||
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
|
||||
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
|
||||
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
|
||||
" add x10, x10, x2 \n\t" // Forward A.
|
||||
" add x11, x11, x4 \n\t" // Forward B.
|
||||
" sub x13, x13, #1 \n\t"
|
||||
" b K_LEFT_LOOP \n\t" // Next column / row.
|
||||
" \n\t"
|
||||
" WRITE_MEM_PREP: \n\t"
|
||||
" \n\t"
|
||||
" ldr x11, %[bi] \n\t"
|
||||
" ldr x12, %[alpha] \n\t" // Load alpha & beta.
|
||||
" ldr x13, %[beta] \n\t"
|
||||
" ld1rd z30.d, p0/z, [x12] \n\t"
|
||||
" ld1rd z31.d, p0/z, [x13] \n\t"
|
||||
" ldr x12, [x12] \n\t"
|
||||
" \n\t"
|
||||
" cmp x8, #1 \n\t"
|
||||
" b.eq PREFETCH_ABNEXT \n\t"
|
||||
" prfm PLDL2STRM, [x11] \n\t"
|
||||
" b WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" PREFETCH_ABNEXT: \n\t"
|
||||
" ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed.
|
||||
" ldr x2, %[b_next] \n\t"
|
||||
" prfm PLDL2KEEP, [x1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*10] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*11] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*12] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*13] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*14] \n\t"
|
||||
" prfm PLDL2KEEP, [x1, 256*15] \n\t"
|
||||
" prfm PLDL2KEEP, [x2] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*1] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*2] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*3] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*4] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*5] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*6] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*7] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*8] \n\t"
|
||||
" prfm PLDL2KEEP, [x2, 256*9] \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM: \n\t"
|
||||
" \n\t"
|
||||
" fmov d28, #1.0 \n\t"
|
||||
" fmov x16, d28 \n\t"
|
||||
" cmp x16, x12 \n\t"
|
||||
" b.eq UNIT_ALPHA \n\t"
|
||||
" \n\t"
|
||||
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
|
||||
" \n\t"
|
||||
" UNIT_ALPHA: \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" mov x10, x5 \n\t" // C address for storing.
|
||||
" cmp x6, #1 \n\t"
|
||||
" b.ne WRITE_MEM_G \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
" mov x13, xzr \n\t" // C-column's physical 1-vector skip.
|
||||
" incb x13 \n\t"
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x10,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x10,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
|
||||
" mov x12, xzr \n\t"
|
||||
" incb x12 \n\t"
|
||||
" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x10,x7,x13,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x10,x7,x13,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" subs x8, x8, #1 \n\t"
|
||||
" b.eq END_EXEC \n\t"
|
||||
" \n\t"
|
||||
" add x0, x0, x3 \n\t" // Forward A's base address to the next logic panel.
|
||||
" add x5, x5, x13 \n\t" // Forward C's base address to the next logic panel.
|
||||
" add x5, x5, x13 \n\t"
|
||||
" b MILLIKER_MLOOP \n\t"
|
||||
" \n\t"
|
||||
" END_ERROR: \n\t"
|
||||
" mov x0, #1 \n\t" // Return error.
|
||||
" END_EXEC: \n\t"
|
||||
" mov x0, #0 \n\t" // Return normal.
|
||||
:
|
||||
: [ai] "m" (ai),
|
||||
[rs_a] "m" (rs_a),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ps_a] "m" (ps_a),
|
||||
[rs_b] "m" (rs_b),
|
||||
[ci] "m" (ci),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[m_mker] "m" (m_mker),
|
||||
[m_left] "m" (m_left),
|
||||
[bi] "m" (bi),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next)
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
|
||||
"x9","x10","x11","x12","x13","x14","x15","x16",//"x17",
|
||||
"z0","z1","z2","z3","z4","z5","z6","z7",
|
||||
"z8","z9","z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19",
|
||||
"z20","z21","z22","z23",
|
||||
"z24","z25","z26","z27",
|
||||
"z28","z29","z30","z31"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,5 +33,13 @@
|
||||
*/
|
||||
|
||||
GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 )
|
||||
GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed )
|
||||
GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed )
|
||||
|
||||
PACKM_KER_PROT( double, d, packm_armsve256_asm_8xk )
|
||||
PACKM_KER_PROT( double, d, packm_armsve512_asm_16xk )
|
||||
PACKM_KER_PROT( double, d, packm_armsve512_asm_12xk )
|
||||
PACKM_KER_PROT( double, d, packm_armsve512_asm_10xk )
|
||||
|
||||
Reference in New Issue
Block a user