mirror of
https://github.com/amd/blis.git
synced 2026-05-21 17:08:17 +00:00
Arm64 dgemmsup with extended MR&NR (#655)
Details: - Since the number of registers in NEON is large but their lengths are short, I'm here extending both MR and NR. - The approach is to represent the C microtile in registers optionally in columns, so for sizes like 6x7m, the 'crr' kernel is the default with 'rrr' supported through an in-register transpose. - A few asm kernels are crafted for 'rv' to complete this extended size support. - For 'rd' I'm still relying heavily on C99 intrinsic kernels with branching so the performance might not be optimal. (Sorry for that.) - So far, these changes only affect the 'firestorm' subconfig. - This commit also contains row-preferential s12x8 and d6x8 gemm ukernels. These microkernels are templatized versions of the existing s8x12 and d6x8 ukernels defined in bli_gemm_armv8a_asm_d6x8.c.
This commit is contained in:
@@ -49,14 +49,14 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
|
||||
cntx,
|
||||
|
||||
// level-3
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_12x8r,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_8x6r,
|
||||
|
||||
// packm
|
||||
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk,
|
||||
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk,
|
||||
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
|
||||
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
|
||||
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk,
|
||||
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk,
|
||||
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
|
||||
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
|
||||
|
||||
// gemmsup
|
||||
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
|
||||
@@ -77,8 +77,8 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
|
||||
cntx,
|
||||
|
||||
// level-3
|
||||
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||
|
||||
// gemmsup
|
||||
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||
@@ -95,11 +95,11 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
|
||||
|
||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 120, 252, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 3072, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 8192, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 12, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 8, 6, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 4096, 3072, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 9600, 8184, -1, -1 );
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
|
||||
@@ -110,8 +110,10 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1,
|
||||
-1, 9, -1, -1 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1,
|
||||
-1, 13, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], -1, 240, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], -1, 1024, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], -1, 3072, -1, -1 );
|
||||
|
||||
@@ -61,6 +61,18 @@
|
||||
CLEAR4V(V4,V5,V6,V7)
|
||||
|
||||
// Scale vectors.
|
||||
// SSCALE1V: emit one "fmul" that multiplies the four single-precision lanes
// of vector register V, in place, by lane IDX of vector register A.
#define SSCALE1V(V,A,IDX) \
|
||||
" fmul v"#V".4s, v"#V".4s, v"#A".s["#IDX"] \n\t"
|
||||
// SSCALE2V: SSCALE1V applied to two registers with the same scalar lane.
#define SSCALE2V(V0,V1,A,IDX) \
|
||||
SSCALE1V(V0,A,IDX) \
|
||||
SSCALE1V(V1,A,IDX)
|
||||
// SSCALE4V: SSCALE2V applied twice (four registers).
#define SSCALE4V(V0,V1,V2,V3,A,IDX) \
|
||||
SSCALE2V(V0,V1,A,IDX) \
|
||||
SSCALE2V(V2,V3,A,IDX)
|
||||
// SSCALE8V: SSCALE4V applied twice (eight registers).
#define SSCALE8V(V0,V1,V2,V3,V4,V5,V6,V7,A,IDX) \
|
||||
SSCALE4V(V0,V1,V2,V3,A,IDX) \
|
||||
SSCALE4V(V4,V5,V6,V7,A,IDX)
|
||||
|
||||
// DSCALE1V: double-precision counterpart of SSCALE1V; multiplies the two
// 64-bit lanes of V, in place, by lane IDX of A.
#define DSCALE1V(V,A,IDX) \
|
||||
" fmul v"#V".2d, v"#V".2d, v"#A".d["#IDX"] \n\t"
|
||||
#define DSCALE2V(V0,V1,A,IDX) \
|
||||
@@ -74,6 +86,18 @@
|
||||
DSCALE4V(V4,V5,V6,V7,A,IDX)
|
||||
|
||||
// Scale-accumulate.
|
||||
// SSCALEA1V: emit one "fmla" computing D += S * A[IDX] over four
// single-precision lanes (D and S are vector register numbers, A[IDX] is a
// single lane of vector register A).
#define SSCALEA1V(D,S,A,IDX) \
|
||||
" fmla v"#D".4s, v"#S".4s, v"#A".s["#IDX"] \n\t"
|
||||
// SSCALEA2V: SSCALEA1V for two destination/source register pairs.
#define SSCALEA2V(D0,D1,S0,S1,A,IDX) \
|
||||
SSCALEA1V(D0,S0,A,IDX) \
|
||||
SSCALEA1V(D1,S1,A,IDX)
|
||||
// SSCALEA4V: SSCALEA2V applied twice (four pairs).
#define SSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
|
||||
SSCALEA2V(D0,D1,S0,S1,A,IDX) \
|
||||
SSCALEA2V(D2,D3,S2,S3,A,IDX)
|
||||
// SSCALEA8V: SSCALEA4V applied twice (eight pairs).
#define SSCALEA8V(D0,D1,D2,D3,D4,D5,D6,D7,S0,S1,S2,S3,S4,S5,S6,S7,A,IDX) \
|
||||
SSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
|
||||
SSCALEA4V(D4,D5,D6,D7,S4,S5,S6,S7,A,IDX)
|
||||
|
||||
// DSCALEA1V: double-precision counterpart of SSCALEA1V; D += S * A[IDX] over
// two 64-bit lanes.
#define DSCALEA1V(D,S,A,IDX) \
|
||||
" fmla v"#D".2d, v"#S".2d, v"#A".d["#IDX"] \n\t"
|
||||
#define DSCALEA2V(D0,D1,S0,S1,A,IDX) \
|
||||
@@ -95,8 +119,16 @@
|
||||
// DLOAD4V: load four vector registers via two DLOAD2V expansions; the second
// pair is read from SHIFT+32.
#define DLOAD4V(V0,V1,V2,V3,ADDR,SHIFT) \
|
||||
DLOAD2V(V0,V1,ADDR,SHIFT) \
|
||||
DLOAD2V(V2,V3,ADDR,SHIFT+32)
|
||||
// The single-precision load names are plain aliases of the double-precision
// ones.
#define SLOAD1V DLOAD1V
|
||||
#define SLOAD2V DLOAD2V
|
||||
#define SLOAD4V DLOAD4V
|
||||
|
||||
// Generic: load one line.
|
||||
// SLOAD1V_GATHER_ELMFWD: fill the four 32-bit lanes of V with four "ld1"
// single-lane loads from [ADDR], each post-incrementing ADDR by INC.
#define SLOAD1V_GATHER_ELMFWD(V,ADDR,INC) \
|
||||
" ld1 {v"#V".s}[0], ["#ADDR"], "#INC" \n\t" \
|
||||
" ld1 {v"#V".s}[1], ["#ADDR"], "#INC" \n\t" \
|
||||
" ld1 {v"#V".s}[2], ["#ADDR"], "#INC" \n\t" \
|
||||
" ld1 {v"#V".s}[3], ["#ADDR"], "#INC" \n\t"
|
||||
// DLOAD1V_GATHER_ELMFWD: same as above for the two 64-bit lanes of V.
#define DLOAD1V_GATHER_ELMFWD(V,ADDR,INC) \
|
||||
" ld1 {v"#V".d}[0], ["#ADDR"], "#INC" \n\t" \
|
||||
" ld1 {v"#V".d}[1], ["#ADDR"], "#INC" \n\t"
|
||||
@@ -110,8 +142,16 @@
|
||||
// DSTORE4V: store four vector registers via two DSTORE2V expansions; the
// second pair is written at SHIFT+32.
#define DSTORE4V(V0,V1,V2,V3,ADDR,SHIFT) \
|
||||
DSTORE2V(V0,V1,ADDR,SHIFT) \
|
||||
DSTORE2V(V2,V3,ADDR,SHIFT+32)
|
||||
// The single-precision store names are plain aliases of the double-precision
// ones.
#define SSTORE1V DSTORE1V
|
||||
#define SSTORE2V DSTORE2V
|
||||
#define SSTORE4V DSTORE4V
|
||||
|
||||
// Generic: store one line.
|
||||
// SSTORE1V_SCATTER_ELMFWD: write the four 32-bit lanes of V with four "st1"
// single-lane stores to [ADDR], each post-incrementing ADDR by INC.
#define SSTORE1V_SCATTER_ELMFWD(V,ADDR,INC) \
|
||||
" st1 {v"#V".s}[0], ["#ADDR"], "#INC" \n\t" \
|
||||
" st1 {v"#V".s}[1], ["#ADDR"], "#INC" \n\t" \
|
||||
" st1 {v"#V".s}[2], ["#ADDR"], "#INC" \n\t" \
|
||||
" st1 {v"#V".s}[3], ["#ADDR"], "#INC" \n\t"
|
||||
// DSTORE1V_SCATTER_ELMFWD: same as above for the two 64-bit lanes of V.
#define DSTORE1V_SCATTER_ELMFWD(V,ADDR,INC) \
|
||||
" st1 {v"#V".d}[0], ["#ADDR"], "#INC" \n\t" \
|
||||
" st1 {v"#V".d}[1], ["#ADDR"], "#INC" \n\t"
|
||||
|
||||
605
kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
Normal file
605
kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
Normal file
@@ -0,0 +1,605 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Label locality & misc.
|
||||
#include "armv8a_asm_utils.h"
|
||||
|
||||
// Nanokernel operations.
|
||||
#include "armv8a_asm_d2x2.h"
|
||||
|
||||
/* Order of row-major SGEMM_12x8's execution in 4x4 blocks:
|
||||
*
|
||||
* +---+ +---+
|
||||
* | 0 | | 1 |
|
||||
* +---+ +---+
|
||||
* +---+ +---+
|
||||
* | 2 | | 3 |
|
||||
* +---+ +---+
|
||||
* +---+ +---+
|
||||
* | 4 | | 5 |
|
||||
* +---+ +---+
|
||||
*/
|
||||
// SGEMM_12X8_MKER_LOOP_PLAIN: one update of all 24 C accumulators
// (C00..CB1, two registers per row of the 12x8 tile). It expands
// SGEMM_4X4_NANOKERNEL (defined elsewhere) six times, pairing each of the
// three A vectors A0..A2 with each of the two B vectors B0,B1.
// LOADNEXT is token-pasted onto DGEMM_LOAD1V_ and therefore must be `load`
// or `noload`. With `load`, A0 is refilled from AADDR+ASHIFT, A1 from
// AADDR+ASHIFT+16 and B0 from BADDR+BSHIFT as soon as their last use in this
// step has been issued; A2 and B1 are never reloaded by this macro.
#define SGEMM_12X8_MKER_LOOP_PLAIN(C00,C01,C10,C11,C20,C21,C30,C31,C40,C41,C50,C51,C60,C61,C70,C71,C80,C81,C90,C91,CA0,CA1,CB0,CB1,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
|
||||
SGEMM_4X4_NANOKERNEL(C00,C10,C20,C30,B0,A0) \
|
||||
SGEMM_4X4_NANOKERNEL(C01,C11,C21,C31,B1,A0) \
|
||||
DGEMM_LOAD1V_ ##LOADNEXT (A0,AADDR,ASHIFT) /* Contiguous load is the same across S/D. */ \
|
||||
SGEMM_4X4_NANOKERNEL(C40,C50,C60,C70,B0,A1) \
|
||||
SGEMM_4X4_NANOKERNEL(C41,C51,C61,C71,B1,A1) \
|
||||
DGEMM_LOAD1V_ ##LOADNEXT (A1,AADDR,ASHIFT+16) \
|
||||
SGEMM_4X4_NANOKERNEL(C80,C90,CA0,CB0,B0,A2) \
|
||||
DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \
|
||||
SGEMM_4X4_NANOKERNEL(C81,C91,CA1,CB1,B1,A2)
|
||||
|
||||
// For contiguous storage of C, SLOAD is the same as DLOAD.
|
||||
// SLOADC_2V_R_FWD: load two vector registers (C0, C1) from CADDR+CSHIFT via
// DLOAD2V, then advance the CADDR register by RSC.
#define SLOADC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \
|
||||
DLOAD2V(C0,C1,CADDR,CSHIFT) \
|
||||
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
// SSTOREC_2V_R_FWD: store two vector registers (C0, C1) to CADDR+CSHIFT via
// DSTORE2V, then advance the CADDR register by RSC.
#define SSTOREC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \
|
||||
DSTORE2V(C0,C1,CADDR,CSHIFT) \
|
||||
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
|
||||
/* Order of row-major DGEMM_8x6's execution in 2x2 blocks:
|
||||
*
|
||||
* +---+ +---+ +---+
|
||||
* | 0 | | 2 | | 4 |
|
||||
* +---+ +---+ +---+
|
||||
* +---+ +---+ +---+
|
||||
* | 1 | | 3 | | 5 |
|
||||
* +---+ +---+ +---+
|
||||
* +---+ +---+ +---+
|
||||
* | 6 | | 8 | | 10|
|
||||
* +---+ +---+ +---+
|
||||
* +---+ +---+ +---+
|
||||
* | 7 | | 9 | | 11|
|
||||
* +---+ +---+ +---+
|
||||
*
|
||||
*/
|
||||
// DGEMM_8X6_MKER_LOOP_PLAIN: one update of all 24 C accumulators
// (C00..C72, three registers per row of the 8x6 tile). It expands
// DGEMM_2X2_NANOKERNEL (defined elsewhere) twelve times, pairing each of the
// four A vectors A0..A3 with each of the three B vectors B0..B2.
// LOADNEXT is token-pasted onto DGEMM_LOAD1V_/DGEMM_LOAD2V_ and therefore
// must be `load` or `noload`. With `load`, A0/A1 are refilled from
// AADDR+ASHIFT (and +16), B0 from BADDR+BSHIFT and B1 from BADDR+BSHIFT+16
// once their last use in this step has been issued; A2, A3 and B2 are never
// reloaded by this macro.
#define DGEMM_8X6_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,C60,C61,C62,C70,C71,C72,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
|
||||
DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \
|
||||
DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \
|
||||
DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \
|
||||
DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \
|
||||
DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \
|
||||
DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \
|
||||
DGEMM_LOAD2V_ ##LOADNEXT (A0,A1,AADDR,ASHIFT) \
|
||||
DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \
|
||||
DGEMM_2X2_NANOKERNEL(C60,C70,B0,A3) \
|
||||
DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \
|
||||
DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \
|
||||
DGEMM_2X2_NANOKERNEL(C61,C71,B1,A3) \
|
||||
DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) \
|
||||
DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2) \
|
||||
DGEMM_2X2_NANOKERNEL(C62,C72,B2,A3)
|
||||
|
||||
// Interleaving load or not.
|
||||
// Selected by the LOADNEXT argument of the *_MKER_LOOP_PLAIN macros: the
// `noload` variants expand to nothing, the `load` variants expand to DLOAD1V.
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
|
||||
#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
|
||||
DLOAD1V(V1,ADDR,IMM)
|
||||
|
||||
// Two-register form: V1 is loaded from ADDR+IMM and V2 from ADDR+IMM+16.
#define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM)
|
||||
#define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \
|
||||
DGEMM_LOAD1V_load(V1,ADDR,IMM) \
|
||||
DGEMM_LOAD1V_load(V2,ADDR,IMM+16)
|
||||
|
||||
// For contiguous storage of C.
|
||||
// DLOADC_3V_R_FWD: load three vector registers (C0, C1 via DLOAD2V at
// CSHIFT, C2 via DLOAD1V at CSHIFT+32) from CADDR, then advance the CADDR
// register by RSC.
#define DLOADC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \
|
||||
DLOAD2V(C0,C1,CADDR,CSHIFT) \
|
||||
DLOAD1V(C2,CADDR,CSHIFT+32) \
|
||||
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
// DSTOREC_3V_R_FWD: store counterpart of DLOADC_3V_R_FWD with the same
// offsets and the same post-advance of CADDR by RSC.
#define DSTOREC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \
|
||||
DSTORE2V(C0,C1,CADDR,CSHIFT) \
|
||||
DSTORE1V(C2,CADDR,CSHIFT+32) \
|
||||
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
|
||||
// Prefetch C.
|
||||
// PRFMC_FWD: issue two PLDL1KEEP prefetches, one at [CADDR] and one at
// [CADDR + LASTB], then advance the CADDR register by RSC.
#define PRFMC_FWD(CADDR,RSC,LASTB) \
|
||||
" prfm PLDL1KEEP, ["#CADDR"] \n\t" \
|
||||
" prfm PLDL1KEEP, ["#CADDR", "#LASTB"] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
|
||||
/*
 * bli_sgemm_armv8a_asm_12x8r: single-precision GEMM microkernel written in
 * AArch64 NEON inline assembly for a 12x8 microtile of C.
 *
 * - a is read with a fixed stride of 12 floats per k-step and b with a fixed
 *   stride of 8 floats per k-step (see the "mov x2, #12" / "mov x3, #8"
 *   setup below).
 * - k is split into k/4 unrolled-by-four iterations plus k%4 single steps.
 * - v0-v23 accumulate C, v24-v27 hold A and v28-v31 hold B.
 * - The accumulators are scaled by *alpha unless *alpha compares equal to
 *   1.0; C is only loaded (and multiplied by *beta) when *beta != 0.0.
 * - C is read and written one row at a time, two vectors per row, using rs_c
 *   as the row stride; cs_c is not passed to the assembly.
 * - a_next / b_next from the auxinfo are only used as prefetch addresses.
 * - NOTE(review): m, n, cs_c and _use_ct are presumably consumed or defined
 *   by GEMM_UKR_SETUP_CT / GEMM_UKR_FLUSH_CT, which are not visible in this
 *   file -- confirm against their definitions.
 */
void bli_sgemm_armv8a_asm_12x8r
|
||||
(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* data,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const void* a_next = bli_auxinfo_next_a( data );
|
||||
const void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// NOTE(review): macro defined elsewhere; it is expected to provide the
// _use_ct flag that is passed to the assembly as %[ct] -- confirm.
GEMM_UKR_SETUP_CT( s, 12, 8, true );
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
" ldr x0, %[a] \n\t"
|
||||
" ldr x1, %[b] \n\t"
|
||||
" mov x2, #12 \n\t" // Column-skip of A.
|
||||
" mov x3, #8 \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
" ldr x5, %[c] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C. (column-skip == 1)
|
||||
" \n\t"
|
||||
" \n\t" // Multiply some address skips by sizeof(float).
|
||||
" lsl x2, x2, #2 \n\t" // cs_a
|
||||
" lsl x3, x3, #2 \n\t" // rs_b
|
||||
" lsl x6, x6, #2 \n\t" // rs_c
|
||||
" \n\t"
|
||||
" cmp %w[ct], wzr \n\t"
|
||||
" mov x9, x5 \n\t"
|
||||
BNE(SEND_PRFMC_FH)
|
||||
// The C prefetches are skipped when %[ct] is non-zero.
// NOTE(review): a row of 8 floats spans bytes 0-31, so the second prefetch
// at offset 32 addresses the first byte past the row -- confirm intended.
PRFMC_FWD(x9,x6,32) // Prefetch C 01/12.
|
||||
PRFMC_FWD(x9,x6,32) // Prefetch C 02/12.
|
||||
PRFMC_FWD(x9,x6,32) // Prefetch C 03/12.
|
||||
PRFMC_FWD(x9,x6,32) // Prefetch C 04/12.
|
||||
PRFMC_FWD(x9,x6,32) // Prefetch C 05/12.
|
||||
PRFMC_FWD(x9,x6,32) // Prefetch C 06/12.
|
||||
LABEL(SEND_PRFMC_FH)
|
||||
" \n\t"
|
||||
" ldr x4, %[k_mker] \n\t" // Number of loops.
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
// Storage scheme:
|
||||
// V[ 0:23] <- C
|
||||
// V[24:27] <- A
|
||||
// V[28:31] <- B
|
||||
// Under this scheme, the following is defined:
|
||||
#define SGEMM_12X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
|
||||
SGEMM_12X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT)
|
||||
// Load from memory.
|
||||
LABEL(SLOAD_ABC)
|
||||
" \n\t" // No-microkernel early return is a must
|
||||
" cmp x4, #0 \n\t" // to avoid out-of-boundary read.
|
||||
BEQ(SCLEAR_CCOLS)
|
||||
" \n\t"
|
||||
" ldr q24, [x0, #16*0] \n\t" // Load A.
|
||||
" ldr q25, [x0, #16*1] \n\t"
|
||||
" ldr q26, [x0, #16*2] \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" ldr q27, [x0, #16*0] \n\t"
|
||||
" \n\t"
|
||||
" cmp %w[ct], wzr \n\t"
|
||||
BNE(SEND_PRFMC_LH)
|
||||
PRFMC_FWD(x9,x6,32) // Prefetch C 07/12.
|
||||
PRFMC_FWD(x9,x6,32) // Prefetch C 08/12.
|
||||
PRFMC_FWD(x9,x6,32) // Prefetch C 09/12.
|
||||
PRFMC_FWD(x9,x6,32) // Prefetch C 10/12.
|
||||
PRFMC_FWD(x9,x6,32) // Prefetch C 11/12.
|
||||
PRFMC_FWD(x9,x6,32) // Prefetch C 12/12.
|
||||
LABEL(SEND_PRFMC_LH)
|
||||
" cmp x4, #0 \n\t" // Reset branching flag.
|
||||
" \n\t"
|
||||
" ldr q28, [x1, #16*0] \n\t" // Load B.
|
||||
" ldr q29, [x1, #16*1] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
" ldr q30, [x1, #16*0] \n\t"
|
||||
" ldr q31, [x1, #16*1] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
LABEL(SCLEAR_CCOLS)
|
||||
CLEAR8V(0,1,2,3,4,5,6,7)
|
||||
CLEAR8V(8,9,10,11,12,13,14,15)
|
||||
CLEAR8V(16,17,18,19,20,21,22,23)
|
||||
// No-microkernel early return, once again.
|
||||
BEQ(SK_LEFT_LOOP)
|
||||
//
|
||||
// Microkernel is defined here as:
|
||||
#define SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1) \
|
||||
SGEMM_12X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,x0,16,x1,0,load) \
|
||||
"add x0, x0, x2 \n\t" \
|
||||
"ldr q"#A2", [x0, #16*0] \n\t" \
|
||||
"ldr q"#B1", [x1, #16*1] \n\t" \
|
||||
"add x1, x1, x3 \n\t"
|
||||
// Start microkernel loop.
|
||||
LABEL(SK_MKER_LOOP)
|
||||
SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29)
|
||||
SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(27,24,25,30,31)
|
||||
" \n\t" // Decrease counter before final replica.
|
||||
" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem.
|
||||
BEQ(SFIN_MKER_LOOP)
|
||||
SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(26,27,24,28,29)
|
||||
SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,30,31)
|
||||
BRANCH(SK_MKER_LOOP)
|
||||
//
|
||||
// Final microkernel loop.
|
||||
LABEL(SFIN_MKER_LOOP)
|
||||
SGEMM_12X8_MKER_LOOP_PLAIN_LOC(26,27,24,28,29,xzr,-1,xzr,-1,noload)
|
||||
// q25 already holds the first vector of the last A column (loaded by the
// preceding replica), so only the remaining two vectors are fetched here.
" ldr q26, [x0, #16*1] \n\t"
|
||||
" ldr q27, [x0, #16*2] \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
SGEMM_12X8_MKER_LOOP_PLAIN_LOC(25,26,27,30,31,xzr,-1,xzr,-1,noload)
|
||||
//
|
||||
// Loops left behind microkernels.
|
||||
LABEL(SK_LEFT_LOOP)
|
||||
" cmp x8, #0 \n\t" // End of exec.
|
||||
BEQ(SWRITE_MEM_PREP)
|
||||
" ldr q24, [x0, #16*0] \n\t" // Load A col.
|
||||
" ldr q25, [x0, #16*1] \n\t"
|
||||
" ldr q26, [x0, #16*2] \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" ldr q28, [x1, #16*0] \n\t" // Load B row.
|
||||
" ldr q29, [x1, #16*1] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
" sub x8, x8, #1 \n\t"
|
||||
SGEMM_12X8_MKER_LOOP_PLAIN_LOC(24,25,26,28,29,xzr,-1,xzr,-1,noload)
|
||||
BRANCH(SK_LEFT_LOOP)
|
||||
//
|
||||
// Scale and write to memory.
|
||||
LABEL(SWRITE_MEM_PREP)
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ld1r {v24.4s}, [x4] \n\t" // Load alpha & beta.
|
||||
" ld1r {v25.4s}, [x8] \n\t"
|
||||
" \n\t"
|
||||
LABEL(SPREFETCH_ABNEXT)
|
||||
" ldr x0, %[a_next] \n\t"
|
||||
" ldr x1, %[b_next] \n\t"
|
||||
" prfm PLDL1STRM, [x0, 64*0] \n\t" // Do not know cache line size,
|
||||
" prfm PLDL1STRM, [x0, 64*1] \n\t" // issue some number of prfm instructions
|
||||
" prfm PLDL1STRM, [x0, 64*2] \n\t" // to try to activate hardware prefetcher.
|
||||
// NOTE(review): the b_next offsets are 0, 64 and 192 (64*2 is skipped)
// whereas a_next uses 0, 64 and 128 -- confirm this is intentional.
" prfm PLDL1STRM, [x1, 64*0] \n\t"
|
||||
" prfm PLDL1STRM, [x1, 64*1] \n\t"
|
||||
" prfm PLDL1STRM, [x1, 64*3] \n\t"
|
||||
" \n\t"
|
||||
// Build 1.0f in s26 and skip the alpha scaling when alpha compares equal.
" fmov d26, #1.0 \n\t"
|
||||
" fcvt s26, d26 \n\t"
|
||||
" fcmp s24, s26 \n\t"
|
||||
BEQ(SUNIT_ALPHA)
|
||||
SSCALE8V(0,1,2,3,4,5,6,7,24,0)
|
||||
SSCALE8V(8,9,10,11,12,13,14,15,24,0)
|
||||
SSCALE8V(16,17,18,19,20,21,22,23,24,0)
|
||||
LABEL(SUNIT_ALPHA)
|
||||
" \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
//
|
||||
// Contiguous C-storage.
|
||||
LABEL(SWRITE_MEM_R)
|
||||
" fcmp s25, #0.0 \n\t" // Sets conditional flag whether *beta == 0.
|
||||
" \n\t" // This conditional flag will be used
|
||||
" \n\t" // multiple times for skipping load.
|
||||
// Row 0 & 1 & 2:
|
||||
BEQ(SZERO_BETA_R_0_1_2)
|
||||
SLOADC_2V_R_FWD(26,27,x9,0,x6)
|
||||
SLOADC_2V_R_FWD(28,29,x9,0,x6)
|
||||
SLOADC_2V_R_FWD(30,31,x9,0,x6)
|
||||
SSCALEA2V(0,1,26,27,25,0)
|
||||
SSCALEA2V(2,3,28,29,25,0)
|
||||
SSCALEA2V(4,5,30,31,25,0)
|
||||
LABEL(SZERO_BETA_R_0_1_2)
|
||||
SSTOREC_2V_R_FWD(0,1,x5,0,x6)
|
||||
SSTOREC_2V_R_FWD(2,3,x5,0,x6)
|
||||
SSTOREC_2V_R_FWD(4,5,x5,0,x6)
|
||||
// Row 3 & 4 & 5 & 6 & 7 & 8:
|
||||
// v0-v5 were stored above, so they are reused here as scratch for C rows.
BEQ(SZERO_BETA_R_3_4_5_6_7_8)
|
||||
SLOADC_2V_R_FWD(26,27,x9,0,x6)
|
||||
SLOADC_2V_R_FWD(28,29,x9,0,x6)
|
||||
SLOADC_2V_R_FWD(30,31,x9,0,x6)
|
||||
SLOADC_2V_R_FWD(0,1,x9,0,x6)
|
||||
SLOADC_2V_R_FWD(2,3,x9,0,x6)
|
||||
SLOADC_2V_R_FWD(4,5,x9,0,x6)
|
||||
SSCALEA4V(6,7,8,9,26,27,28,29,25,0)
|
||||
SSCALEA4V(10,11,12,13,30,31,0,1,25,0)
|
||||
SSCALEA4V(14,15,16,17,2,3,4,5,25,0)
|
||||
LABEL(SZERO_BETA_R_3_4_5_6_7_8)
|
||||
SSTOREC_2V_R_FWD(6,7,x5,0,x6)
|
||||
SSTOREC_2V_R_FWD(8,9,x5,0,x6)
|
||||
SSTOREC_2V_R_FWD(10,11,x5,0,x6)
|
||||
SSTOREC_2V_R_FWD(12,13,x5,0,x6)
|
||||
SSTOREC_2V_R_FWD(14,15,x5,0,x6)
|
||||
SSTOREC_2V_R_FWD(16,17,x5,0,x6)
|
||||
// Row 9 & 10 & 11
|
||||
BEQ(SZERO_BETA_R_9_10_11)
|
||||
SLOADC_2V_R_FWD(26,27,x9,0,x6)
|
||||
SLOADC_2V_R_FWD(28,29,x9,0,x6)
|
||||
SLOADC_2V_R_FWD(30,31,x9,0,x6)
|
||||
SSCALEA2V(18,19,26,27,25,0)
|
||||
SSCALEA2V(20,21,28,29,25,0)
|
||||
SSCALEA2V(22,23,30,31,25,0)
|
||||
LABEL(SZERO_BETA_R_9_10_11)
|
||||
SSTOREC_2V_R_FWD(18,19,x5,0,x6)
|
||||
SSTOREC_2V_R_FWD(20,21,x5,0,x6)
|
||||
SSTOREC_2V_R_FWD(22,23,x5,0,x6)
|
||||
// Done.
|
||||
LABEL(SEND_WRITE_MEM)
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_c] "m" (rs_c),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next),
|
||||
[ct] "r" (_use_ct) // Defined by macro.
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9",
|
||||
"v0","v1","v2","v3","v4","v5","v6","v7",
|
||||
"v8","v9","v10","v11","v12","v13","v14","v15",
|
||||
"v16","v17","v18","v19",
|
||||
"v20","v21","v22","v23",
|
||||
"v24","v25","v26","v27",
|
||||
"v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
GEMM_UKR_FLUSH_CT( s );
|
||||
}
|
||||
|
||||
/*
|
||||
* Differences from the col-major 6x8 in HW modeling:
|
||||
* * Stream HW prefetcher is assumed s.t. PRFM instructions for packed A&B are omitted.
|
||||
*/
|
||||
/*
 * bli_dgemm_armv8a_asm_8x6r: double-precision GEMM microkernel written in
 * AArch64 NEON inline assembly for an 8x6 microtile of C.
 *
 * - a is read with a fixed stride of 8 doubles per k-step and b with a fixed
 *   stride of 6 doubles per k-step (see the "mov x2, #8" / "mov x3, #6"
 *   setup below).
 * - k is split into k/4 unrolled-by-four iterations plus k%4 single steps.
 * - v0-v23 accumulate C, v24-v27 hold A and v28-v31 hold B.
 * - The accumulators are scaled by *alpha unless *alpha compares equal to
 *   1.0; C is only loaded (and multiplied by *beta) when *beta != 0.0.
 * - C is read and written one row at a time, three vectors per row, using
 *   rs_c as the row stride; cs_c is not passed to the assembly.
 * - a_next / b_next from the auxinfo are only used as prefetch addresses.
 * - NOTE(review): m, n, cs_c and _use_ct are presumably consumed or defined
 *   by GEMM_UKR_SETUP_CT / GEMM_UKR_FLUSH_CT, which are not visible in this
 *   file -- confirm against their definitions.
 */
void bli_dgemm_armv8a_asm_8x6r
|
||||
(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* data,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const void* a_next = bli_auxinfo_next_a( data );
|
||||
const void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// NOTE(review): macro defined elsewhere; it is expected to provide the
// _use_ct flag that is passed to the assembly as %[ct] -- confirm.
GEMM_UKR_SETUP_CT( d, 8, 6, true );
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
" ldr x0, %[a] \n\t"
|
||||
" ldr x1, %[b] \n\t"
|
||||
" mov x2, #8 \n\t" // Column-skip of A.
|
||||
" mov x3, #6 \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
" ldr x5, %[c] \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C. (column-skip == 1)
|
||||
" \n\t"
|
||||
" \n\t" // Multiply some address skips by sizeof(double).
|
||||
" lsl x2, x2, #3 \n\t" // cs_a
|
||||
" lsl x3, x3, #3 \n\t" // rs_b
|
||||
" lsl x6, x6, #3 \n\t" // rs_c
|
||||
" \n\t"
|
||||
" cmp %w[ct], wzr \n\t"
|
||||
" mov x9, x5 \n\t"
|
||||
BNE(DEND_PRFMC)
|
||||
// The C prefetches are skipped when %[ct] is non-zero. Offset 40 addresses
// the last double of a 6-double (48-byte) row.
PRFMC_FWD(x9,x6,40) // Prefetch C 1/8.
|
||||
PRFMC_FWD(x9,x6,40) // Prefetch C 2/8.
|
||||
PRFMC_FWD(x9,x6,40) // Prefetch C 3/8.
|
||||
PRFMC_FWD(x9,x6,40) // Prefetch C 4/8.
|
||||
PRFMC_FWD(x9,x6,40) // Prefetch C 5/8.
|
||||
PRFMC_FWD(x9,x6,40) // Prefetch C 6/8.
|
||||
PRFMC_FWD(x9,x6,40) // Prefetch C 7/8.
|
||||
PRFMC_FWD(x9,x6,40) // Prefetch C 8/8.
|
||||
LABEL(DEND_PRFMC)
|
||||
" \n\t"
|
||||
" ldr x4, %[k_mker] \n\t" // Number of loops.
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
// Storage scheme:
|
||||
// V[ 0:23] <- C
|
||||
// V[24:27] <- A
|
||||
// V[28:31] <- B
|
||||
// Under this scheme, the following is defined:
|
||||
#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
|
||||
DGEMM_8X6_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT)
|
||||
// Load from memory.
|
||||
LABEL(DLOAD_ABC)
|
||||
" \n\t" // No-microkernel early return is a must
|
||||
" cmp x4, #0 \n\t" // to avoid out-of-boundary read.
|
||||
BEQ(DCLEAR_CCOLS)
|
||||
" \n\t"
|
||||
" ldr q24, [x0, #16*0] \n\t" // Load A.
|
||||
" ldr q25, [x0, #16*1] \n\t"
|
||||
" ldr q26, [x0, #16*2] \n\t"
|
||||
" ldr q27, [x0, #16*3] \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" \n\t"
|
||||
" ldr q28, [x1, #16*0] \n\t" // Load B.
|
||||
" ldr q29, [x1, #16*1] \n\t"
|
||||
" ldr q30, [x1, #16*2] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
" ldr q31, [x1, #16*0] \n\t"
|
||||
LABEL(DCLEAR_CCOLS)
|
||||
CLEAR8V(0,1,2,3,4,5,6,7)
|
||||
CLEAR8V(8,9,10,11,12,13,14,15)
|
||||
CLEAR8V(16,17,18,19,20,21,22,23)
|
||||
// No-microkernel early return, once again.
|
||||
BEQ(DK_LEFT_LOOP)
|
||||
//
|
||||
// Microkernel is defined here as:
|
||||
#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,B0,B1,B2) \
|
||||
DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,x0,0,x1,16,load) \
|
||||
"add x1, x1, x3 \n\t" \
|
||||
"ldr q"#B2", [x1, #16*0] \n\t" \
|
||||
"ldr q"#A2", [x0, #16*2] \n\t" \
|
||||
"ldr q"#A3", [x0, #16*3] \n\t" \
|
||||
"add x0, x0, x2 \n\t"
|
||||
// Start microkernel loop.
|
||||
LABEL(DK_MKER_LOOP)
|
||||
DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,28,29,30)
|
||||
DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,31,28,29)
|
||||
" \n\t" // Decrease counter before final replica.
|
||||
" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem.
|
||||
BEQ(DFIN_MKER_LOOP)
|
||||
DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,30,31,28)
|
||||
DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,29,30,31)
|
||||
BRANCH(DK_MKER_LOOP)
|
||||
//
|
||||
// Final microkernel loop.
|
||||
LABEL(DFIN_MKER_LOOP)
|
||||
DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,30,31,28,x0,0,x1,16,load)
|
||||
// Unlike the _FWD replicas, no B vector is fetched after this advance:
// q29 already holds the first vector of the last B row.
" add x1, x1, x3 \n\t"
|
||||
" ldr q26, [x0, #16*2] \n\t"
|
||||
" ldr q27, [x0, #16*3] \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,29,30,31,xzr,-1,xzr,-1,noload)
|
||||
//
|
||||
// Loops left behind microkernels.
|
||||
LABEL(DK_LEFT_LOOP)
|
||||
" cmp x8, #0 \n\t" // End of exec.
|
||||
BEQ(DWRITE_MEM_PREP)
|
||||
" ldr q24, [x0, #16*0] \n\t" // Load A col.
|
||||
" ldr q25, [x0, #16*1] \n\t"
|
||||
" ldr q26, [x0, #16*2] \n\t"
|
||||
" ldr q27, [x0, #16*3] \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" ldr q28, [x1, #16*0] \n\t" // Load B row.
|
||||
" ldr q29, [x1, #16*1] \n\t"
|
||||
" ldr q30, [x1, #16*2] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
" sub x8, x8, #1 \n\t"
|
||||
DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload)
|
||||
BRANCH(DK_LEFT_LOOP)
|
||||
//
|
||||
// Scale and write to memory.
|
||||
LABEL(DWRITE_MEM_PREP)
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ld1r {v24.2d}, [x4] \n\t" // Load alpha & beta.
|
||||
" ld1r {v25.2d}, [x8] \n\t"
|
||||
" \n\t"
|
||||
LABEL(DPREFETCH_ABNEXT)
|
||||
" ldr x0, %[a_next] \n\t"
|
||||
" ldr x1, %[b_next] \n\t"
|
||||
" prfm PLDL1STRM, [x0, 64*0] \n\t" // Do not know cache line size,
|
||||
" prfm PLDL1STRM, [x0, 64*1] \n\t" // issue some number of prfm instructions
|
||||
" prfm PLDL1STRM, [x0, 64*2] \n\t" // to try to activate hardware prefetcher.
|
||||
// NOTE(review): the b_next offsets are 0, 64 and 192 (64*2 is skipped)
// whereas a_next uses 0, 64 and 128 -- confirm this is intentional.
" prfm PLDL1STRM, [x1, 64*0] \n\t"
|
||||
" prfm PLDL1STRM, [x1, 64*1] \n\t"
|
||||
" prfm PLDL1STRM, [x1, 64*3] \n\t"
|
||||
" \n\t"
|
||||
// Build 1.0 in d26 and skip the alpha scaling when alpha compares equal.
" fmov d26, #1.0 \n\t"
|
||||
" fcmp d24, d26 \n\t"
|
||||
BEQ(DUNIT_ALPHA)
|
||||
DSCALE8V(0,1,2,3,4,5,6,7,24,0)
|
||||
DSCALE8V(8,9,10,11,12,13,14,15,24,0)
|
||||
DSCALE8V(16,17,18,19,20,21,22,23,24,0)
|
||||
LABEL(DUNIT_ALPHA)
|
||||
" \n\t"
|
||||
" mov x9, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
//
|
||||
// Contiguous C-storage.
|
||||
LABEL(DWRITE_MEM_R)
|
||||
" fcmp d25, #0.0 \n\t" // Sets conditional flag whether *beta == 0.
|
||||
" \n\t" // This conditional flag will be used
|
||||
" \n\t" // multiple times for skipping load.
|
||||
// Row 0 & 1:
|
||||
BEQ(DZERO_BETA_R_0_1)
|
||||
DLOADC_3V_R_FWD(26,27,28,x9,0,x6)
|
||||
DLOADC_3V_R_FWD(29,30,31,x9,0,x6)
|
||||
DSCALEA2V(0,1,26,27,25,0)
|
||||
DSCALEA2V(2,3,28,29,25,0)
|
||||
DSCALEA2V(4,5,30,31,25,0)
|
||||
LABEL(DZERO_BETA_R_0_1)
|
||||
DSTOREC_3V_R_FWD(0,1,2,x5,0,x6)
|
||||
DSTOREC_3V_R_FWD(3,4,5,x5,0,x6)
|
||||
// Row 2 & 3 & 4 & 5:
|
||||
// v0-v5 were stored above, so they are reused here as scratch for C rows.
BEQ(DZERO_BETA_R_2_3_4_5)
|
||||
DLOADC_3V_R_FWD(26,27,28,x9,0,x6)
|
||||
DLOADC_3V_R_FWD(29,30,31,x9,0,x6)
|
||||
DLOADC_3V_R_FWD(0,1,2,x9,0,x6)
|
||||
DLOADC_3V_R_FWD(3,4,5,x9,0,x6)
|
||||
DSCALEA4V(6,7,8,9,26,27,28,29,25,0)
|
||||
DSCALEA4V(10,11,12,13,30,31,0,1,25,0)
|
||||
DSCALEA4V(14,15,16,17,2,3,4,5,25,0)
|
||||
LABEL(DZERO_BETA_R_2_3_4_5)
|
||||
DSTOREC_3V_R_FWD(6,7,8,x5,0,x6)
|
||||
DSTOREC_3V_R_FWD(9,10,11,x5,0,x6)
|
||||
DSTOREC_3V_R_FWD(12,13,14,x5,0,x6)
|
||||
DSTOREC_3V_R_FWD(15,16,17,x5,0,x6)
|
||||
// Row 6 & 7
|
||||
BEQ(DZERO_BETA_R_6_7)
|
||||
DLOADC_3V_R_FWD(26,27,28,x9,0,x6)
|
||||
DLOADC_3V_R_FWD(29,30,31,x9,0,x6)
|
||||
DSCALEA2V(18,19,26,27,25,0)
|
||||
DSCALEA2V(20,21,28,29,25,0)
|
||||
DSCALEA2V(22,23,30,31,25,0)
|
||||
LABEL(DZERO_BETA_R_6_7)
|
||||
DSTOREC_3V_R_FWD(18,19,20,x5,0,x6)
|
||||
DSTOREC_3V_R_FWD(21,22,23,x5,0,x6)
|
||||
// Done.
|
||||
LABEL(DEND_WRITE_MEM)
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_c] "m" (rs_c),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next),
|
||||
[ct] "r" (_use_ct) // Defined by macro.
|
||||
: "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9",
|
||||
"v0","v1","v2","v3","v4","v5","v6","v7",
|
||||
"v8","v9","v10","v11","v12","v13","v14","v15",
|
||||
"v16","v17","v18","v19",
|
||||
"v20","v21","v22","v23",
|
||||
"v24","v25","v26","v27",
|
||||
"v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
GEMM_UKR_FLUSH_CT( d );
|
||||
}
|
||||
|
||||
@@ -1,450 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Separate instantiation for Armv8-A reference kernels.
|
||||
// Temporary workaround. Will be removed after upstream has switched to a better way
|
||||
// of exposing gemmsup interface.
|
||||
|
||||
//
|
||||
// -- Row storage case ---------------------------------------------------------
|
||||
//
|
||||
|
||||
// Generator for the reference gemmsup microkernel that visits C one row at a
// time. For each element (row, col) of the m x n tile it accumulates the
// k-term dot product of a row of A with a column of B, applying the
// conjugations requested by conja/conjb, and then merges alpha times that
// product into C: accumulated when beta is one, overwritten when beta is zero,
// and blended with beta * C otherwise.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t*          data, \
       cntx_t*             cntx  \
     ) \
{ \
	/* No microtile-size limit applies: any m, n, and k are handled. */ \
\
	/* Walk c row by row. */ \
	for ( dim_t row = 0; row < m; ++row ) \
	{ \
		ctype* restrict a_row = a + row*rs_a; \
		ctype* restrict c_row = c + row*rs_c; \
\
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			ctype* restrict b_col = b     + col*cs_b; \
			ctype* restrict c_elt = c_row + col*cs_c; \
			ctype           acc; \
\
			PASTEMAC(ch,set0s)( acc ); \
\
			/* Dot product for element (row,col), specialized on the
			   conjugation of a and b. */ \
			if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elt = a_row + p*cs_a; \
					ctype* restrict b_elt = b_col + p*rs_b; \
\
					PASTEMAC(ch,dots)( *a_elt, *b_elt, acc ); \
				} \
			} \
			else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elt = a_row + p*cs_a; \
					ctype* restrict b_elt = b_col + p*rs_b; \
\
					PASTEMAC(ch,axpyjs)( *a_elt, *b_elt, acc ); \
				} \
			} \
			else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elt = a_row + p*cs_a; \
					ctype* restrict b_elt = b_col + p*rs_b; \
\
					PASTEMAC(ch,dotjs)( *a_elt, *b_elt, acc ); \
				} \
			} \
			else /* both a and b conjugated */ \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elt = a_row + p*cs_a; \
					ctype* restrict b_elt = b_col + p*rs_b; \
\
					PASTEMAC(ch,dots)( *a_elt, *b_elt, acc ); \
				} \
\
				/* conj(a) * conj(b) is obtained by conjugating the plain
				   product once at the end. */ \
				PASTEMAC(ch,conjs)( acc ); \
			} \
\
			/* beta == 1: accumulate into c; beta == 0: overwrite c;
			   otherwise: scale c by beta and accumulate. */ \
			if ( PASTEMAC(ch,eq1)( *beta ) ) \
			{ \
				PASTEMAC(ch,axpys)( *alpha, acc, *c_elt ); \
			} \
			else if ( PASTEMAC(ch,eq0)( *beta ) ) \
			{ \
				PASTEMAC(ch,scal2s)( *alpha, acc, *c_elt ); \
			} \
			else \
			{ \
				PASTEMAC(ch,axpbys)( *alpha, acc, *beta, *c_elt ); \
			} \
		} \
	} \
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( gemmsup_r, _armv8a, _ref2 )
|
||||
|
||||
//
|
||||
// -- Column storage case ------------------------------------------------------
|
||||
//
|
||||
|
||||
// Generator for the reference gemmsup microkernel that visits C one column at
// a time. For each element (row, col) of the m x n tile it accumulates the
// k-term dot product of a row of A with a column of B, applying the
// conjugations requested by conja/conjb, and then merges alpha times that
// product into C: accumulated when beta is one, overwritten when beta is zero,
// and blended with beta * C otherwise.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t*          data, \
       cntx_t*             cntx  \
     ) \
{ \
	/* No microtile-size limit applies: any m, n, and k are handled. */ \
\
	/* Walk c column by column. */ \
	for ( dim_t col = 0; col < n; ++col ) \
	{ \
		ctype* restrict c_col = c + col*cs_c; \
		ctype* restrict b_col = b + col*cs_b; \
\
		for ( dim_t row = 0; row < m; ++row ) \
		{ \
			ctype* restrict c_elt = c_col + row*rs_c; \
			ctype* restrict a_row = a     + row*rs_a; \
			ctype           acc; \
\
			PASTEMAC(ch,set0s)( acc ); \
\
			/* Dot product for element (row,col), specialized on the
			   conjugation of a and b. */ \
			if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elt = a_row + p*cs_a; \
					ctype* restrict b_elt = b_col + p*rs_b; \
\
					PASTEMAC(ch,dots)( *a_elt, *b_elt, acc ); \
				} \
			} \
			else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elt = a_row + p*cs_a; \
					ctype* restrict b_elt = b_col + p*rs_b; \
\
					PASTEMAC(ch,axpyjs)( *a_elt, *b_elt, acc ); \
				} \
			} \
			else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elt = a_row + p*cs_a; \
					ctype* restrict b_elt = b_col + p*rs_b; \
\
					PASTEMAC(ch,dotjs)( *a_elt, *b_elt, acc ); \
				} \
			} \
			else /* both a and b conjugated */ \
			{ \
				for ( dim_t p = 0; p < k; ++p ) \
				{ \
					ctype* restrict a_elt = a_row + p*cs_a; \
					ctype* restrict b_elt = b_col + p*rs_b; \
\
					PASTEMAC(ch,dots)( *a_elt, *b_elt, acc ); \
				} \
\
				/* conj(a) * conj(b) is obtained by conjugating the plain
				   product once at the end. */ \
				PASTEMAC(ch,conjs)( acc ); \
			} \
\
			/* beta == 1: accumulate into c; beta == 0: overwrite c;
			   otherwise: scale c by beta and accumulate. */ \
			if ( PASTEMAC(ch,eq1)( *beta ) ) \
			{ \
				PASTEMAC(ch,axpys)( *alpha, acc, *c_elt ); \
			} \
			else if ( PASTEMAC(ch,eq0)( *beta ) ) \
			{ \
				PASTEMAC(ch,scal2s)( *alpha, acc, *c_elt ); \
			} \
			else \
			{ \
				PASTEMAC(ch,axpbys)( *alpha, acc, *beta, *c_elt ); \
			} \
		} \
	} \
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( gemmsup_c, _armv8a, _ref2 )
|
||||
|
||||
@@ -37,7 +37,6 @@
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
|
||||
|
||||
// Label locality & misc.
|
||||
#include "../armv8a_asm_utils.h"
|
||||
@@ -109,6 +108,83 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
|
||||
" prfm PLDL1KEEP, ["#CADDR"] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#DLONGC" \n\t"
|
||||
|
||||
|
||||
BLIS_INLINE
|
||||
void bli_dgemmsup_rd_armv8a_inline_3x4m
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* data,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
assert( n0 == 4 );
|
||||
|
||||
for ( ; m0 >= 3; m0 -= 3 )
|
||||
{
|
||||
bli_dgemmsup_rd_armv8a_asm_3x4
|
||||
(
|
||||
conja, conjb, 3, 4, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
a += 3 * rs_a0;
|
||||
c += 3 * rs_c0;
|
||||
}
|
||||
|
||||
if ( m0 > 0 )
|
||||
{
|
||||
bli_dgemmsup_rd_armv8a_int_3x4
|
||||
(
|
||||
conja, conjb, m0, 4, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
BLIS_INLINE
|
||||
void bli_dgemmsup_rd_armv8a_inline_3xcm
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* data,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
for ( ; m0 > 0; m0 -= 3 )
|
||||
{
|
||||
dim_t m_loc = ( m0 < 3 ) ? m0 : 3;
|
||||
|
||||
bli_dgemmsup_rd_armv8a_int_3x4
|
||||
(
|
||||
conja, conjb, m_loc, n0, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
|
||||
a += 3 * rs_a0;
|
||||
c += 3 * rs_c0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void bli_dgemmsup_rd_armv8a_asm_6x8m
|
||||
(
|
||||
conj_t conja,
|
||||
@@ -127,58 +203,74 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m
|
||||
{
|
||||
if ( n0 != 8 )
|
||||
{
|
||||
if ( n0 < 8 )
|
||||
assert( n0 <= 13 );
|
||||
|
||||
// Manual separation.
|
||||
dgemmsup_ker_ft ker_fp1 = NULL;
|
||||
dgemmsup_ker_ft ker_fp2 = NULL;
|
||||
dgemmsup_ker_ft ker_fp3 = NULL;
|
||||
dim_t nr1, nr2, nr3;
|
||||
|
||||
switch ( n0 )
|
||||
{
|
||||
for ( ; n0 >= 4; n0 -= 4 )
|
||||
{
|
||||
dim_t m = m0;
|
||||
double *a_loc = a;
|
||||
double *c_loc = c;
|
||||
|
||||
for ( ; m >= 3; m -= 3 )
|
||||
{
|
||||
bli_dgemmsup_rd_armv8a_asm_3x4
|
||||
(
|
||||
conja, conjb, 3, 4, k0,
|
||||
alpha, a_loc, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c_loc, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
a_loc += 3 * rs_a0;
|
||||
c_loc += 3 * rs_c0;
|
||||
}
|
||||
|
||||
if ( m > 0 )
|
||||
{
|
||||
bli_dgemmsup_rd_armv8a_int_3x4
|
||||
(
|
||||
conja, conjb, m, 4, k0,
|
||||
alpha, a_loc, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c_loc, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
}
|
||||
b += 4 * cs_b0;
|
||||
c += 4 * cs_c0;
|
||||
}
|
||||
|
||||
for ( ; m0 > 0; m0 -= 3 )
|
||||
{
|
||||
dim_t m_loc = ( m0 < 3 ) ? m0 : 3;
|
||||
|
||||
bli_dgemmsup_rd_armv8a_int_3x4
|
||||
(
|
||||
conja, conjb, m_loc, n0, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
|
||||
a += 3 * rs_a0;
|
||||
c += 3 * rs_c0;
|
||||
}
|
||||
case 13:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function.
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3;
|
||||
ker_fp3 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr3 = 2; break;
|
||||
case 12:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function.
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr2 = 4; break;
|
||||
case 11:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function.
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3; break;
|
||||
case 10:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function.
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break;
|
||||
case 9:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function.
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 1; break;
|
||||
case 7:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr1 = 4;
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3; break;
|
||||
case 6:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr1 = 4;
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break;
|
||||
case 5:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = 3;
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break;
|
||||
case 4:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = 4; break;
|
||||
default:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = n0; break;
|
||||
}
|
||||
else
|
||||
|
||||
ker_fp1
|
||||
(
|
||||
conja, conjb, m0, nr1, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
b += nr1 * cs_b0;
|
||||
c += nr1 * cs_c0;
|
||||
if ( ker_fp2 )
|
||||
{
|
||||
assert( FALSE );
|
||||
ker_fp2
|
||||
(
|
||||
conja, conjb, m0, nr2, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
b += nr2 * cs_b0;
|
||||
c += nr2 * cs_c0;
|
||||
}
|
||||
if ( ker_fp3 )
|
||||
ker_fp3
|
||||
(
|
||||
conja, conjb, m0, nr3, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -37,7 +37,6 @@
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
|
||||
|
||||
// Label locality & misc.
|
||||
#include "../armv8a_asm_utils.h"
|
||||
@@ -102,6 +101,122 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
|
||||
" prfm PLDL1KEEP, ["#CADDR"] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#DLONGC" \n\t"
|
||||
|
||||
|
||||
BLIS_INLINE
|
||||
void bli_dgemmsup_rd_armv8a_inline_4x8n
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* data,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
assert( m0 == 4 );
|
||||
|
||||
for ( ; n0 > 0; n0 -= 8 )
|
||||
{
|
||||
// Call twice the 2xc kernel in column order.
|
||||
dim_t n_loc = ( n0 < 8 ) ? n0 : 8;
|
||||
bli_dgemmsup_rd_armv8a_int_2x8
|
||||
(
|
||||
conja, conjb, 2, n_loc, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
bli_dgemmsup_rd_armv8a_int_2x8
|
||||
(
|
||||
conja, conjb, 2, n_loc, k0,
|
||||
alpha, a + 2 * rs_a0, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c + 2 * rs_c0, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
b += 8 * cs_b0;
|
||||
c += 8 * cs_c0;
|
||||
}
|
||||
}
|
||||
|
||||
BLIS_INLINE
|
||||
void bli_dgemmsup_rd_armv8a_inline_3x8n
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* data,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
assert( m0 == 3 );
|
||||
|
||||
for ( ; n0 >= 4; n0 -= 4 )
|
||||
{
|
||||
bli_dgemmsup_rd_armv8a_asm_3x4
|
||||
(
|
||||
conja, conjb, 3, 4, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
b += 4 * cs_b0;
|
||||
c += 4 * cs_c0;
|
||||
}
|
||||
if ( n0 > 0 )
|
||||
{
|
||||
bli_dgemmsup_rd_armv8a_int_3x4
|
||||
(
|
||||
conja, conjb, 3, n0, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
BLIS_INLINE
|
||||
void bli_dgemmsup_rd_armv8a_inline_rx8n
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* data,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
assert( m0 <= 2 );
|
||||
|
||||
for ( ; n0 > 0; n0 -= 8 )
|
||||
{
|
||||
dim_t n_loc = ( n0 < 8 ) ? n0 : 8;
|
||||
bli_dgemmsup_rd_armv8a_int_2x8
|
||||
(
|
||||
conja, conjb, m0, n_loc, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
b += 8 * cs_b0;
|
||||
c += 8 * cs_c0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void bli_dgemmsup_rd_armv8a_asm_6x8n
|
||||
(
|
||||
conj_t conja,
|
||||
@@ -120,116 +235,51 @@ void bli_dgemmsup_rd_armv8a_asm_6x8n
|
||||
{
|
||||
if ( m0 != 6 )
|
||||
{
|
||||
if ( m0 < 6 )
|
||||
{
|
||||
if ( m0 == 5 )
|
||||
{
|
||||
// 3xk calls.
|
||||
dim_t n = n0;
|
||||
double *b_loc = b;
|
||||
double *c_loc = c;
|
||||
for ( ; n >= 4; n -= 4 )
|
||||
{
|
||||
bli_dgemmsup_rd_armv8a_asm_3x4
|
||||
(
|
||||
conja, conjb, 3, 4, k0,
|
||||
alpha, a, rs_a0, cs_a0, b_loc, rs_b0, cs_b0,
|
||||
beta, c_loc, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
b_loc += 4 * cs_b0;
|
||||
c_loc += 4 * cs_c0;
|
||||
}
|
||||
if ( n > 0 )
|
||||
{
|
||||
bli_dgemmsup_rd_armv8a_int_3x4
|
||||
(
|
||||
conja, conjb, 3, n, k0,
|
||||
alpha, a, rs_a0, cs_a0, b_loc, rs_b0, cs_b0,
|
||||
beta, c_loc, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
}
|
||||
a += 3 * rs_a0;
|
||||
c += 3 * rs_c0;
|
||||
assert( m0 <= 9 );
|
||||
|
||||
// 2xk calls.
|
||||
for ( ; n0 > 0; n0 -= 8 )
|
||||
{
|
||||
dim_t n_loc = ( n0 < 8 ) ? n0 : 8;
|
||||
bli_dgemmsup_rd_armv8a_int_2x8
|
||||
(
|
||||
conja, conjb, 2, n_loc, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
b += 8 * cs_b0;
|
||||
c += 8 * cs_c0;
|
||||
}
|
||||
return;
|
||||
}
|
||||
else if ( m0 == 4 )
|
||||
{
|
||||
for ( ; n0 > 0; n0 -= 8 )
|
||||
{
|
||||
dim_t n_loc = ( n0 < 8 ) ? n0 : 8;
|
||||
bli_dgemmsup_rd_armv8a_int_2x8
|
||||
(
|
||||
conja, conjb, 2, n_loc, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
bli_dgemmsup_rd_armv8a_int_2x8
|
||||
(
|
||||
conja, conjb, 2, n_loc, k0,
|
||||
alpha, a + 2 * rs_a0, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c + 2 * rs_c0, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
b += 8 * cs_b0;
|
||||
c += 8 * cs_c0;
|
||||
}
|
||||
}
|
||||
else if ( m0 == 3 )
|
||||
{
|
||||
for ( ; n0 >= 4; n0 -= 4 )
|
||||
{
|
||||
bli_dgemmsup_rd_armv8a_asm_3x4
|
||||
(
|
||||
conja, conjb, 3, 4, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
b += 4 * cs_b0;
|
||||
c += 4 * cs_c0;
|
||||
}
|
||||
if ( n0 > 0 )
|
||||
{
|
||||
bli_dgemmsup_rd_armv8a_int_3x4
|
||||
(
|
||||
conja, conjb, 3, n0, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
else // m0 == 2 or 1.
|
||||
{
|
||||
for ( ; n0 > 0; n0 -= 8 )
|
||||
{
|
||||
dim_t n_loc = ( n0 < 8 ) ? n0 : 8;
|
||||
bli_dgemmsup_rd_armv8a_int_2x8
|
||||
(
|
||||
conja, conjb, m0, n_loc, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
b += 8 * cs_b0;
|
||||
c += 8 * cs_c0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
// Manual separation.
|
||||
dgemmsup_ker_ft ker_fp1 = NULL;
|
||||
dgemmsup_ker_ft ker_fp2 = NULL;
|
||||
dim_t mr1, mr2;
|
||||
|
||||
switch ( m0 )
|
||||
{
|
||||
assert( FALSE );
|
||||
case 9:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8n; mr1 = 6; // This function.
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr2 = 3; break;
|
||||
case 8:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8n; mr1 = 6; // This function.
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr2 = 2; break;
|
||||
case 7:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3;
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_4x8n; mr2 = 4; break;
|
||||
case 5:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3;
|
||||
ker_fp2 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr2 = 2; break;
|
||||
case 4:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_inline_4x8n; mr1 = 4; break;
|
||||
case 3:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3; break;
|
||||
default:
|
||||
ker_fp1 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr1 = m0; break;
|
||||
}
|
||||
|
||||
ker_fp1
|
||||
(
|
||||
conja, conjb, mr1, n0, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
a += mr1 * rs_a0;
|
||||
c += mr1 * rs_c0;
|
||||
if ( ker_fp2 )
|
||||
ker_fp2
|
||||
(
|
||||
conja, conjb, mr2, n0, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -36,7 +36,6 @@
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
|
||||
|
||||
// Label locality & misc.
|
||||
#include "../armv8a_asm_utils.h"
|
||||
@@ -76,6 +75,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
|
||||
" prfm PLDL1KEEP, ["#CADDR"] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#DLONGC" \n\t"
|
||||
|
||||
// For row-storage of C.
|
||||
#define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \
|
||||
DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \
|
||||
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
@@ -83,6 +83,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
|
||||
DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \
|
||||
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
|
||||
// For column-storage of C.
|
||||
#define DLOADC_4V_C_FWD(C00,C10,C01,C11,CADDR,CSHIFT,CSC) \
|
||||
DLOAD2V(C00,C10,CADDR,CSHIFT) \
|
||||
" add "#CADDR", "#CADDR", "#CSC" \n\t" \
|
||||
|
||||
482
kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
Normal file
482
kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
Normal file
@@ -0,0 +1,482 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
|
||||
// Label locality & misc.
|
||||
#include "../armv8a_asm_utils.h"
|
||||
|
||||
// Nanokernel operations.
|
||||
#include "../armv8a_asm_d2x2.h"
|
||||
|
||||
/* Order of row-major DGEMM_5x8's execution in 2x2 blocks:
|
||||
*
|
||||
* +---+ +---+ +---+ +---+
|
||||
* | 0 | | 1 | | 6 | | 7 |
|
||||
* +---+ +---+ +---+ +---+
|
||||
* +---+ +---+ +---+ +---+
|
||||
* | 2 | | 3 | | 8 | | 9 |
|
||||
* +---+ +---+ +---+ +---+
|
||||
* ----- ----- ----- -----
|
||||
* 4 5 10 11
|
||||
*/
|
||||
// One step of the 5x8 microkernel loop, emitted as inline-asm text.
//  - Cxy are the register numbers of the 20 accumulators: x is the row of the
//    5-row tile, y selects one of four 2-wide column pairs.
//  - B0..B3 are the four vectors of B used by this step. A0 and A1 feed the
//    2x2 nanokernels for rows 0-1 and 2-3 (DGEMM_2X2_NANOKERNEL comes from
//    ../armv8a_asm_d2x2.h; its exact lane usage is not visible here). Lane
//    d[0] of A2 feeds the explicit fmla instructions for row 4.
//  - LOADNEXT is token-pasted onto DGEMM_LOAD2V_ and DGEMM_LOAD1V_G_, so it
//    must be `load` or `noload`. With `load`, B0/B1 are refilled from
//    [BADDR + BSHIFT] and A0/A1 are refilled through AELEMADDR with
//    post-increment AELEMST, interleaved with the arithmetic.
//  - B2, B3 and A2 are never reloaded by this macro.
#define DGEMM_5X8_MKER_LOOP_PLAIN(C00,C01,C02,C03,C10,C11,C12,C13,C20,C21,C22,C23,C30,C31,C32,C33,C40,C41,C42,C43,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
|
||||
DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \
|
||||
DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \
|
||||
DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \
|
||||
DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \
|
||||
" fmla v"#C40".2d, v"#B0".2d, v"#A2".d[0] \n\t" \
|
||||
" fmla v"#C41".2d, v"#B1".2d, v"#A2".d[0] \n\t" \
|
||||
DGEMM_LOAD2V_ ##LOADNEXT (B0,B1,BADDR,BSHIFT) \
|
||||
DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \
|
||||
DGEMM_2X2_NANOKERNEL(C03,C13,B3,A0) \
|
||||
DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \
|
||||
DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \
|
||||
DGEMM_2X2_NANOKERNEL(C23,C33,B3,A1) \
|
||||
DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \
|
||||
" fmla v"#C42".2d, v"#B2".2d, v"#A2".d[0] \n\t" \
|
||||
" fmla v"#C43".2d, v"#B3".2d, v"#A2".d[0] \n\t"
|
||||
|
||||
// Optional operand loads that can be interleaved into the microkernel loop.
// Each macro has a `_load` form that emits the instruction(s) and a `_noload`
// form that expands to nothing; DGEMM_5X8_MKER_LOOP_PLAIN picks one by
// token-pasting its LOADNEXT argument.
//  - DGEMM_LOAD1V:   one `ldr q` of register V1 from [ADDR, #IMM].
//  - DGEMM_LOAD2V:   two such loads, V1 at IMM and V2 at IMM+16.
//  - DGEMM_LOAD1V_G: fills lanes d[0] and d[1] of V1 with two post-indexed
//                    `ld1` loads, so ADDR is advanced by ST after each element.
|
||||
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
|
||||
#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
|
||||
" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t"
|
||||
|
||||
#define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM)
|
||||
#define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \
|
||||
DGEMM_LOAD1V_load(V1,ADDR,IMM) \
|
||||
DGEMM_LOAD1V_load(V2,ADDR,IMM+16)
|
||||
|
||||
#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST)
|
||||
#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \
|
||||
" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \
|
||||
" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t"
|
||||
|
||||
// Prefetch C in the long direction: issues `prfm PLDL1KEEP` on [CADDR] and
// then advances CADDR by DLONGC, so repeated uses step through C one stride
// at a time. CADDR is modified.
|
||||
#define DPRFMC_FWD(CADDR,DLONGC) \
|
||||
" prfm PLDL1KEEP, ["#CADDR"] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#DLONGC" \n\t"
|
||||
|
||||
// For row-storage of C: load (DLOADC_4V_R_FWD) or store (DSTOREC_4V_R_FWD)
// four vector registers C0..C3 at CADDR + CSHIFT through DLOAD4V / DSTORE4V
// (defined outside this part of the file), then advance CADDR by RSC
// (presumably the row stride of C in bytes - TODO confirm against the caller).
// CADDR is modified.
|
||||
#define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \
|
||||
DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \
|
||||
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
#define DSTOREC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \
|
||||
DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \
|
||||
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
|
||||
// For column-storage of C: load / store 2+1/2 vectors.
// Two full vectors (C0, C1) are moved at CADDR + CSHIFT through DLOAD2V /
// DSTORE2V, and a single lane CIDX of C2 is moved with `ld1` / `st1` at
// CADDR + CSHIFT + 32; that address is formed in the scratch register CTMP.
// CADDR is then advanced by CSC (presumably the column stride of C in bytes -
// TODO confirm against the caller). CADDR and CTMP are modified.
|
||||
#define DLOADC_2PHV_C_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,CSC,CTMP) \
|
||||
" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \
|
||||
DLOAD2V(C0,C1,CADDR,CSHIFT) \
|
||||
" ld1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#CSC" \n\t"
|
||||
#define DSTOREC_2PHV_C_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,CSC,CTMP) \
|
||||
" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \
|
||||
DSTORE2V(C0,C1,CADDR,CSHIFT) \
|
||||
" st1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#CSC" \n\t"
|
||||
|
||||
// Five-register variants composed from the 4- and 1-register helpers
// DSCALE4V/DSCALE1V and DSCALEA4V/DSCALEA1V (defined outside this block).
// Judging from their call sites, DSCALE* presumably scales V0..V4 by lane
// <IDX> of v<A>, and DSCALEA* presumably accumulates S0..S4 scaled by that
// lane into D0..D4 -- confirm against the helper definitions.
#define DSCALE5V(V0,V1,V2,V3,V4,A,IDX) \
  DSCALE4V(V0,V1,V2,V3,A,IDX) \
  DSCALE1V(V4,A,IDX)
#define DSCALEA5V(D0,D1,D2,D3,D4,S0,S1,S2,S3,S4,A,IDX) \
  DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
  DSCALEA1V(D4,S4,A,IDX)
|
||||
|
||||
|
||||
void bli_dgemmsup_rv_armv8a_asm_5x8n
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* data,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
assert( m0 == 5 );
|
||||
|
||||
// LLVM has very bad routing ability for inline asm.
|
||||
// Limit number of registers in case of Clang compilation.
|
||||
#ifndef __clang__
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
#endif
|
||||
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k0 / 6;
|
||||
uint64_t k_left = k0 % 6;
|
||||
|
||||
int64_t n_iter = n0 / 8;
|
||||
int64_t n_left = n0 % 8;
|
||||
|
||||
uint64_t rs_a = rs_a0;
|
||||
uint64_t cs_a = cs_a0;
|
||||
uint64_t rs_b = rs_b0;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
// uint64_t cs_b = cs_b0;
|
||||
assert( cs_b0 == 1 );
|
||||
|
||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
" ldr x10, %[b] \n\t"
|
||||
" ldr x13, %[c] \n\t"
|
||||
" ldr x12, %[n_iter] \n\t"
|
||||
" ldr x11, %[ps_b] \n\t" // Panel-skip of B.
|
||||
" ldr x3, %[rs_b] \n\t" // Row-skip of B.
|
||||
" ldr x9, %[rs_a] \n\t" // Row-skip of A.
|
||||
" ldr x2, %[cs_a] \n\t" // Column-skip of A.
|
||||
" \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
" \n\t"
|
||||
" \n\t" // Multiply some address skips by sizeof(double).
|
||||
" lsl x11, x11, #3 \n\t" // ps_b
|
||||
" lsl x9, x9, #3 \n\t" // rs_a
|
||||
" lsl x2, x2, #3 \n\t" // cs_a
|
||||
" lsl x3, x3, #3 \n\t" // rs_b
|
||||
" lsl x6, x6, #3 \n\t" // rs_c
|
||||
" lsl x7, x7, #3 \n\t" // cs_c
|
||||
" \n\t"
|
||||
" mov x1, x5 \n\t"
|
||||
" cmp x7, #8 \n\t" // Prefetch column-strided C.
|
||||
BEQ(C_PREFETCH_COLS)
|
||||
DPRFMC_FWD(x1,x6)
|
||||
DPRFMC_FWD(x1,x6)
|
||||
DPRFMC_FWD(x1,x6)
|
||||
DPRFMC_FWD(x1,x6)
|
||||
DPRFMC_FWD(x1,x6)
|
||||
BRANCH(C_PREFETCH_END)
|
||||
LABEL(C_PREFETCH_COLS)
|
||||
// This prefetch will not cover further mker perts. Skip.
|
||||
//
|
||||
// DPRFMC_FWD(x1,x7)
|
||||
// DPRFMC_FWD(x1,x7)
|
||||
// DPRFMC_FWD(x1,x7)
|
||||
// DPRFMC_FWD(x1,x7)
|
||||
// DPRFMC_FWD(x1,x7)
|
||||
// DPRFMC_FWD(x1,x7)
|
||||
// DPRFMC_FWD(x1,x7)
|
||||
// DPRFMC_FWD(x1,x7)
|
||||
LABEL(C_PREFETCH_END)
|
||||
//
|
||||
// Millikernel.
|
||||
LABEL(MILLIKER_MLOOP)
|
||||
" \n\t"
|
||||
" mov x1, x10 \n\t" // Parameters to be reloaded
|
||||
" mov x5, x13 \n\t" // within each millikernel loop.
|
||||
" ldr x0, %[a] \n\t"
|
||||
" ldr x4, %[k_mker] \n\t"
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
// Storage scheme:
|
||||
// V[ 0:19] <- C
|
||||
// V[20:25] <- A
|
||||
// V[26:31] <- B
|
||||
// Under this scheme, the following is defined:
|
||||
#define DGEMM_5X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
|
||||
DGEMM_5X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT)
|
||||
// Load from memory.
|
||||
LABEL(LOAD_ABC)
|
||||
" \n\t" // No-microkernel early return is a must
|
||||
" cmp x4, #0 \n\t" // to avoid out-of-boundary read.
|
||||
BEQ(CLEAR_CCOLS)
|
||||
" \n\t"
|
||||
" ldr q26, [x1, #16*0] \n\t" // Load B first.
|
||||
" ldr q27, [x1, #16*1] \n\t"
|
||||
" ldr q28, [x1, #16*2] \n\t"
|
||||
" ldr q29, [x1, #16*3] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
" ldr q30, [x1, #16*0] \n\t"
|
||||
" ldr q31, [x1, #16*1] \n\t"
|
||||
" \n\t"
|
||||
" mov x14, x0 \n\t" // Load A.
|
||||
" ld1 {v20.d}[0], [x14], x9 \n\t" // We want A to be kept in L1.
|
||||
" ld1 {v20.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v21.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v21.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v22.d}[0], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
" ld1 {v23.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v23.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v24.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v24.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v25.d}[0], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
LABEL(CLEAR_CCOLS)
|
||||
CLEAR8V(0,1,2,3,4,5,6,7)
|
||||
CLEAR8V(8,9,10,11,12,13,14,15)
|
||||
CLEAR4V(16,17,18,19)
|
||||
// No-microkernel early return, once again.
|
||||
BEQ(K_LEFT_LOOP)
|
||||
//
|
||||
// Microkernel is defined here as:
|
||||
#define DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3) \
|
||||
DGEMM_5X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,x14,x9,x1,16*2,load) \
|
||||
"add x1, x1, x3 \n\t" \
|
||||
"ldr q"#B2", [x1, #16*0] \n\t" /* Next B line. */ \
|
||||
"ldr q"#B3", [x1, #16*1] \n\t" \
|
||||
"ld1 {v"#A2".d}[0], [x14], x9 \n\t" /* Finish A line. */ \
|
||||
"add x0, x0, x2 \n\t" \
|
||||
"mov x14, x0 \n\t"
|
||||
// Start microkernel loop.
|
||||
LABEL(K_MKER_LOOP)
|
||||
DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,26,27,28,29)
|
||||
DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,30,31,26,27)
|
||||
DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,28,29,30,31)
|
||||
DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,26,27,28,29)
|
||||
" \n\t" // Decrease counter before final replica.
|
||||
" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem.
|
||||
BEQ(FIN_MKER_LOOP)
|
||||
DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,30,31,26,27)
|
||||
DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,28,29,30,31)
|
||||
BRANCH(K_MKER_LOOP)
|
||||
//
|
||||
// Final microkernel loop.
|
||||
LABEL(FIN_MKER_LOOP)
|
||||
DGEMM_5X8_MKER_LOOP_PLAIN_LOC(20,21,22,30,31,26,27,xzr,-1,xzr,-1,noload)
|
||||
" ldr q30, [x1, #16*2] \n\t"
|
||||
" ldr q31, [x1, #16*3] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
DGEMM_5X8_MKER_LOOP_PLAIN_LOC(23,24,25,28,29,30,31,xzr,-1,xzr,-1,noload)
|
||||
//
|
||||
// Loops left behind microkernels.
|
||||
LABEL(K_LEFT_LOOP)
|
||||
" cmp x8, #0 \n\t" // End of exec.
|
||||
BEQ(WRITE_MEM_PREP)
|
||||
" ldr q26, [x1, #16*0] \n\t" // Load B row.
|
||||
" ldr q27, [x1, #16*1] \n\t"
|
||||
" ldr q28, [x1, #16*2] \n\t"
|
||||
" ldr q29, [x1, #16*3] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
" ld1 {v20.d}[0], [x14], x9 \n\t" // Load A col.
|
||||
" ld1 {v20.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v21.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v21.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v22.d}[0], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" sub x8, x8, #1 \n\t"
|
||||
DGEMM_5X8_MKER_LOOP_PLAIN_LOC(20,21,22,26,27,28,29,xzr,-1,xzr,-1,noload)
|
||||
BRANCH(K_LEFT_LOOP)
|
||||
//
|
||||
// Scale and write to memory.
|
||||
LABEL(WRITE_MEM_PREP)
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta.
|
||||
" ld1r {v31.2d}, [x8] \n\t"
|
||||
" fmov d20, #1.0 \n\t"
|
||||
" fcmp d30, d20 \n\t"
|
||||
BEQ(UNIT_ALPHA_R)
|
||||
DSCALE8V(0,1,2,3,4,5,6,7,30,0)
|
||||
DSCALE8V(8,9,10,11,12,13,14,15,30,0)
|
||||
DSCALE4V(16,17,18,19,30,0)
|
||||
LABEL(UNIT_ALPHA_R)
|
||||
" \n\t"
|
||||
" mov x1, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x7, #8 \n\t" // Check for column-storage.
|
||||
BNE(WRITE_MEM_C)
|
||||
//
|
||||
// C storage in rows.
|
||||
LABEL(WRITE_MEM_R)
|
||||
" fcmp d31, #0.0 \n\t"
|
||||
BEQ(ZERO_BETA_R_1_2)
|
||||
DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6)
|
||||
DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6)
|
||||
DSCALEA4V(0,1,2,3,20,21,22,23,31,0)
|
||||
DSCALEA4V(4,5,6,7,24,25,26,27,31,0)
|
||||
LABEL(ZERO_BETA_R_1_2)
|
||||
DSTOREC_4V_R_FWD(0,1,2,3,x5,0,x6)
|
||||
DSTOREC_4V_R_FWD(4,5,6,7,x5,0,x6)
|
||||
BEQ(ZERO_BETA_R_3_4_5)
|
||||
DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6)
|
||||
DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6)
|
||||
DLOADC_4V_R_FWD(0,1,2,3,x1,0,x6)
|
||||
DSCALEA8V(8,9,10,11,12,13,14,15,20,21,22,23,24,25,26,27,31,0)
|
||||
DSCALEA4V(16,17,18,19,0,1,2,3,31,0)
|
||||
LABEL(ZERO_BETA_R_3_4_5)
|
||||
#ifndef __clang__
|
||||
" cmp x12, #1 \n\t"
|
||||
BRANCH(PRFM_END_R)
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t"
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*0] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*1] \n\t"
|
||||
LABEL(PRFM_END_R)
|
||||
#endif
|
||||
DSTOREC_4V_R_FWD(8,9,10,11,x5,0,x6)
|
||||
DSTOREC_4V_R_FWD(12,13,14,15,x5,0,x6)
|
||||
DSTOREC_4V_R_FWD(16,17,18,19,x5,0,x6)
|
||||
BRANCH(END_WRITE_MEM)
|
||||
//
|
||||
// C storage in columns.
|
||||
LABEL(WRITE_MEM_C)
|
||||
// In-register transpose,
|
||||
// do transposition in row-order.
|
||||
" trn1 v20.2d, v0.2d, v4.2d \n\t" // Row 0-1.
|
||||
" trn2 v21.2d, v0.2d, v4.2d \n\t"
|
||||
" trn1 v22.2d, v1.2d, v5.2d \n\t"
|
||||
" trn2 v23.2d, v1.2d, v5.2d \n\t"
|
||||
" trn1 v24.2d, v2.2d, v6.2d \n\t"
|
||||
" trn2 v25.2d, v2.2d, v6.2d \n\t"
|
||||
" trn1 v26.2d, v3.2d, v7.2d \n\t"
|
||||
" trn2 v27.2d, v3.2d, v7.2d \n\t"
|
||||
" \n\t"
|
||||
" trn1 v0.2d, v8.2d, v12.2d \n\t" // Row 2-3.
|
||||
" trn2 v1.2d, v8.2d, v12.2d \n\t"
|
||||
" trn1 v2.2d, v9.2d, v13.2d \n\t"
|
||||
" trn2 v3.2d, v9.2d, v13.2d \n\t"
|
||||
" trn1 v4.2d, v10.2d, v14.2d \n\t"
|
||||
" trn2 v5.2d, v10.2d, v14.2d \n\t"
|
||||
" trn1 v6.2d, v11.2d, v15.2d \n\t"
|
||||
" trn2 v7.2d, v11.2d, v15.2d \n\t"
|
||||
" \n\t"
|
||||
" fcmp d31, #0.0 \n\t"
|
||||
BEQ(ZERO_BETA_C_1_2_3_4)
|
||||
DLOADC_2PHV_C_FWD(8,9,10,0,x1,0,x7,x8)
|
||||
DLOADC_2PHV_C_FWD(11,12,10,1,x1,0,x7,x8)
|
||||
DLOADC_2PHV_C_FWD(13,14,15,0,x1,0,x7,x8)
|
||||
DLOADC_2PHV_C_FWD(28,29,15,1,x1,0,x7,x8)
|
||||
DSCALEA5V(20,0,21,1,16,8,9,11,12,10,31,0)
|
||||
DSCALEA5V(22,2,23,3,17,13,14,28,29,15,31,0)
|
||||
LABEL(ZERO_BETA_C_1_2_3_4)
|
||||
DSTOREC_2PHV_C_FWD(20,0,16,0,x5,0,x7,x8)
|
||||
DSTOREC_2PHV_C_FWD(21,1,16,1,x5,0,x7,x8)
|
||||
DSTOREC_2PHV_C_FWD(22,2,17,0,x5,0,x7,x8)
|
||||
DSTOREC_2PHV_C_FWD(23,3,17,1,x5,0,x7,x8)
|
||||
BEQ(ZERO_BETA_C_5_6_7_8)
|
||||
DLOADC_2PHV_C_FWD(8,9,10,0,x1,0,x7,x8)
|
||||
DLOADC_2PHV_C_FWD(11,12,10,1,x1,0,x7,x8)
|
||||
DLOADC_2PHV_C_FWD(13,14,15,0,x1,0,x7,x8)
|
||||
DLOADC_2PHV_C_FWD(28,29,15,1,x1,0,x7,x8)
|
||||
DSCALEA5V(24,4,25,5,18,8,9,11,12,10,31,0)
|
||||
DSCALEA5V(26,6,27,7,19,13,14,28,29,15,31,0)
|
||||
LABEL(ZERO_BETA_C_5_6_7_8)
|
||||
#ifndef __clang__
|
||||
" cmp x12, #1 \n\t"
|
||||
BRANCH(PRFM_END_C)
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t"
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*0] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*1] \n\t"
|
||||
LABEL(PRFM_END_C)
|
||||
#endif
|
||||
DSTOREC_2PHV_C_FWD(24,4,18,0,x5,0,x7,x8)
|
||||
DSTOREC_2PHV_C_FWD(25,5,18,1,x5,0,x7,x8)
|
||||
DSTOREC_2PHV_C_FWD(26,6,19,0,x5,0,x7,x8)
|
||||
DSTOREC_2PHV_C_FWD(27,7,19,1,x5,0,x7,x8)
|
||||
//
|
||||
// End of this microkernel.
|
||||
LABEL(END_WRITE_MEM)
|
||||
" \n\t"
|
||||
" subs x12, x12, #1 \n\t"
|
||||
BEQ(END_EXEC)
|
||||
" \n\t"
|
||||
" mov x8, #8 \n\t"
|
||||
" madd x13, x7, x8, x13 \n\t" // Forward C's base address to the next logic panel.
|
||||
" add x10, x10, x11 \n\t" // Forward B's base address to the next logic panel.
|
||||
BRANCH(MILLIKER_MLOOP)
|
||||
//
|
||||
// End of execution.
|
||||
LABEL(END_EXEC)
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_a] "m" (rs_a),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ps_b] "m" (ps_b),
|
||||
[rs_b] "m" (rs_b),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
// In Clang, even "m"-passed parameter takes 1 register.
|
||||
// Have to disable prefetching to pass compilation.
|
||||
#ifndef __clang__
|
||||
[a_next] "r" (a_next),
|
||||
[b_next] "r" (b_next),
|
||||
#endif
|
||||
[n_iter] "m" (n_iter),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta)
|
||||
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
|
||||
"x8", "x9", "x10","x11","x12","x13","x14",
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v8", "v9", "v10","v11","v12","v13","v14","v15",
|
||||
"v16","v17","v18","v19","v20","v21","v22","v23",
|
||||
"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
consider_edge_cases:
|
||||
// Forward address.
|
||||
b = b + n_iter * ps_b;
|
||||
c = c + n_iter * 8 * cs_c;
|
||||
if ( n_left )
|
||||
{
|
||||
// Set panel stride to unpacked mode.
|
||||
// Only 1 millikernel w.r.t. 6x8 is executed.
|
||||
auxinfo_t data_d6x4mn = *data;
|
||||
bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
|
||||
//
|
||||
bli_dgemmsup_rv_armv8a_int_6x4mn
|
||||
(
|
||||
conja, conjb, 5, n_left, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
475
kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
Normal file
475
kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
Normal file
@@ -0,0 +1,475 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
|
||||
// Label locality & misc.
|
||||
#include "../armv8a_asm_utils.h"
|
||||
|
||||
// Nanokernel operations.
|
||||
#include "../armv8a_asm_d2x2.h"
|
||||
|
||||
/* Odd-NR dgemmsup_rv_*m kernels are special in that
 * despite the row-major name, C is laid out in COLUMNS in the register space.
 *
 * Block order:
 *
 *  +---+ +---+
 *  | 0 | | 3 | |6
 *  +---+ +---+ |
 *  +---+ +---+
 *  | 1 | | 4 | |7
 *  +---+ +---+ |
 *  +---+ +---+
 *  | 2 | | 5 | |8
 *  +---+ +---+ |
 *
 * One rank-1 update of the 6x5 tile:
 *  - blocks 0-2 update the two C registers of each row pair with B0,
 *    blocks 3-5 do the same with B1 (DGEMM_2X2_NANOKERNEL is defined
 *    outside this block);
 *  - blocks 6-8 update the fifth column with lane BIDX of B2.
 * Interleaved between the updates, B0/B1 and A0/A1 are reloaded for a later
 * step when LOADNEXT is "load"; with "noload" those loads expand to nothing.
 * A2 and B2 are never reloaded by this macro.
 */
#define DGEMM_C6X5_MKER_LOOP_PLAIN(C00,C01,C02,C03,C04,C10,C11,C12,C13,C14,C20,C21,C22,C23,C24,A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
  DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) \
  DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) \
  DGEMM_2X2_NANOKERNEL(C20,C21,A2,B0) \
  DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \
  DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) \
  DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1) \
  DGEMM_2X2_NANOKERNEL(C22,C23,A2,B1) \
  DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) \
" fmla v"#C04".2d, v"#A0".2d, v"#B2".d["#BIDX"] \n\t" \
  DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \
" fmla v"#C14".2d, v"#A1".2d, v"#B2".d["#BIDX"] \n\t" \
  DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \
" fmla v"#C24".2d, v"#A2".2d, v"#B2".d["#BIDX"] \n\t"
|
||||
|
||||
// Interleaving load or not.
//
// The microkernel macro selects one of these by token-pasting its LOADNEXT
// argument ("load" or "noload") onto the macro-name prefix. The *_noload
// variants expand to nothing.

// Contiguous load of one 128-bit register: ldr q<V1>, [<ADDR>, #<IMM>].
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t"

// Strided ("gather") load of one register: both 64-bit lanes of v<V1> are
// filled by lane-wise ld1, and <ADDR> is post-incremented by <ST> after
// each lane.
#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST)
#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \
" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \
" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t"
|
||||
|
||||
// Prefetch C in the long direction.
// Issues one PLDL1KEEP prefetch at <CADDR>, then advances <CADDR> by
// <DLONGC> so that back-to-back invocations walk along C.
#define DPRFMC_FWD(CADDR,DLONGC) \
" prfm PLDL1KEEP, ["#CADDR"] \n\t" \
" add "#CADDR", "#CADDR", "#DLONGC" \n\t"
|
||||
|
||||
// For column-storage of C.
// One invocation handles a 6-element column of C through three registers:
// DLOAD2V/DSTORE2V at <CSHIFT> and DLOAD1V/DSTORE1V at <CSHIFT>+32 (helpers
// defined outside this block; presumably contiguous q-register accesses
// relative to <CADDR>). <CADDR> is then advanced by <CSC> to the next
// column.
#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
  DLOAD2V(C0,C1,CADDR,CSHIFT) \
  DLOAD1V(C2,CADDR,CSHIFT+32) \
" add "#CADDR", "#CADDR", "#CSC" \n\t"
#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
  DSTORE2V(C0,C1,CADDR,CSHIFT) \
  DSTORE1V(C2,CADDR,CSHIFT+32) \
" add "#CADDR", "#CADDR", "#CSC" \n\t"
|
||||
|
||||
// For row-storage of C: Store 2+1/2 vectors.
//
// One invocation handles a 5-element row of C:
//   - <CTMP> is set to <CADDR> + <CSHIFT> + 32, the address of the fifth
//     element;
//   - DLOAD2V / DSTORE2V (defined outside this block; presumably a
//     two-register contiguous access at [<CADDR>, #<CSHIFT>]) covers the
//     first four elements via v<C0> and v<C1>;
//   - the fifth element is moved through 64-bit lane <CIDX> of v<C2>;
//   - <CADDR> is then advanced by <RSC> to the next row.
#define DLOADC_2PHV_R_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,RSC,CTMP) \
" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \
  DLOAD2V(C0,C1,CADDR,CSHIFT) \
" ld1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \
" add "#CADDR", "#CADDR", "#RSC" \n\t"
#define DSTOREC_2PHV_R_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,RSC,CTMP) \
" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \
  DSTORE2V(C0,C1,CADDR,CSHIFT) \
" st1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
|
||||
// Five-register variants composed from the 4- and 1-register helpers
// DSCALE4V/DSCALE1V and DSCALEA4V/DSCALEA1V (defined outside this block).
// Judging from their call sites, DSCALE* presumably scales V0..V4 by lane
// <IDX> of v<A>, and DSCALEA* presumably accumulates S0..S4 scaled by that
// lane into D0..D4 -- confirm against the helper definitions.
#define DSCALE5V(V0,V1,V2,V3,V4,A,IDX) \
  DSCALE4V(V0,V1,V2,V3,A,IDX) \
  DSCALE1V(V4,A,IDX)
#define DSCALEA5V(D0,D1,D2,D3,D4,S0,S1,S2,S3,S4,A,IDX) \
  DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
  DSCALEA1V(D4,S4,A,IDX)
|
||||
|
||||
|
||||
void bli_dgemmsup_rv_armv8a_asm_6x5m
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* data,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
assert( n0 == 5 );
|
||||
|
||||
// LLVM has very bad routing ability for inline asm.
|
||||
// Limit number of registers in case of Clang compilation.
|
||||
#ifndef __clang__
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
#endif
|
||||
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k0 / 6;
|
||||
uint64_t k_left = k0 % 6;
|
||||
|
||||
int64_t m_iter = m0 / 6;
|
||||
int64_t m_left = m0 % 6;
|
||||
|
||||
uint64_t rs_a = rs_a0;
|
||||
uint64_t cs_a = cs_a0;
|
||||
uint64_t rs_b = rs_b0;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
// uint64_t cs_b = cs_b0;
|
||||
assert( cs_b0 == 1 );
|
||||
|
||||
if ( m_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
" ldr x10, %[a] \n\t"
|
||||
" ldr x13, %[c] \n\t"
|
||||
" ldr x12, %[m_iter] \n\t"
|
||||
" ldr x11, %[ps_a] \n\t" // Panel-skip of A.
|
||||
" ldr x9, %[rs_a] \n\t" // Row-skip of A.
|
||||
" ldr x2, %[cs_a] \n\t" // Column-skip of A.
|
||||
" ldr x3, %[rs_b] \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
" \n\t"
|
||||
" \n\t" // Multiply some address skips by sizeof(double).
|
||||
" lsl x11, x11, #3 \n\t" // ps_a
|
||||
" lsl x9, x9, #3 \n\t" // rs_a
|
||||
" lsl x2, x2, #3 \n\t" // cs_a
|
||||
" lsl x3, x3, #3 \n\t" // rs_b
|
||||
" lsl x6, x6, #3 \n\t" // rs_c
|
||||
" lsl x7, x7, #3 \n\t" // cs_c
|
||||
" \n\t"
|
||||
" mov x1, x5 \n\t"
|
||||
" cmp x7, #8 \n\t" // Prefetch column-strided C.
|
||||
BNE(C_PREFETCH_COLS)
|
||||
// This prefetch will not cover further mker perts. Skip.
|
||||
//
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
BRANCH(C_PREFETCH_END)
|
||||
LABEL(C_PREFETCH_COLS)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
LABEL(C_PREFETCH_END)
|
||||
//
|
||||
// Millikernel.
|
||||
LABEL(MILLIKER_MLOOP)
|
||||
" \n\t"
|
||||
" mov x0, x10 \n\t" // Parameters to be reloaded
|
||||
" mov x5, x13 \n\t" // within each millikernel loop.
|
||||
" ldr x1, %[b] \n\t"
|
||||
" ldr x4, %[k_mker] \n\t"
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
// Storage scheme:
|
||||
// V[ 0:14] <- C
|
||||
// V[15:23] <- A
|
||||
// V[24:29] <- B
|
||||
// Under this scheme, the following is defined:
|
||||
#define DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT)
|
||||
// Load from memory.
|
||||
LABEL(LOAD_ABC)
|
||||
" \n\t" // No-microkernel early return is a must
|
||||
" cmp x4, #0 \n\t" // to avoid out-of-boundary read.
|
||||
BEQ(CLEAR_CCOLS)
|
||||
" \n\t"
|
||||
" mov x14, x0 \n\t" // Load A.
|
||||
" ld1 {v15.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v15.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v16.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v16.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v17.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v17.d}[1], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
" ld1 {v18.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v18.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v19.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v19.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v20.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v20.d}[1], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
" ld1 {v21.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v21.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v22.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v22.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v23.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v23.d}[1], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
" \n\t"
|
||||
" ldr q24, [x1, #16*0] \n\t" // Load B.
|
||||
" ldr q25, [x1, #16*1] \n\t"
|
||||
" ldr d26, [x1, #16*2] \n\t" // Scalar loads into idx 0.
|
||||
" add x1, x1, x3 \n\t"
|
||||
" ldr q27, [x1, #16*0] \n\t"
|
||||
" ldr q28, [x1, #16*1] \n\t"
|
||||
" ldr d29, [x1, #16*2] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
LABEL(CLEAR_CCOLS)
|
||||
CLEAR4V(0,1,2,3)
|
||||
CLEAR1V(4)
|
||||
CLEAR4V(5,6,7,8)
|
||||
CLEAR1V(9)
|
||||
CLEAR4V(10,11,12,13)
|
||||
CLEAR1V(14)
|
||||
// No-microkernel early return, once again.
|
||||
BEQ(K_LEFT_LOOP)
|
||||
//
|
||||
// Microkernel is defined here as:
|
||||
#define DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,BIDX) \
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,BIDX,x14,x9,x1,0,load) \
|
||||
"ld1 {v"#A2".d}[0], [x14], x9 \n\t" \
|
||||
"ld1 {v"#A2".d}[1], [x14], x9 \n\t" \
|
||||
"add x0, x0, x2 \n\t" \
|
||||
"mov x14, x0 \n\t" \
|
||||
/* Due to this loading, BIDX can only be 0 here. */ \
|
||||
"ldr d"#B2", [x1, #16*2] \n\t" \
|
||||
"add x1, x1, x3 \n\t"
|
||||
// Start microkernel loop.
|
||||
LABEL(K_MKER_LOOP)
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(15,16,17,24,25,26,0)
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,27,28,29,0)
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,24,25,26,0)
|
||||
" \n\t" // Decrease counter before final replica.
|
||||
" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem.
|
||||
BEQ(FIN_MKER_LOOP)
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(15,16,17,27,28,29,0)
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,24,25,26,0)
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,27,28,29,0)
|
||||
BRANCH(K_MKER_LOOP)
|
||||
//
|
||||
// Final microkernel loop.
|
||||
LABEL(FIN_MKER_LOOP)
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(15,16,17,27,28,29,0,xzr,-1,xzr,-1,noload)
|
||||
" ldr q27, [x1, #16*0] \n\t"
|
||||
" ldr q28, [x1, #16*1] \n\t"
|
||||
" ldr d29, [x1, #16*2] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(18,19,20,24,25,26,0,xzr,-1,xzr,-1,noload)
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(21,22,23,27,28,29,0,xzr,-1,xzr,-1,noload)
|
||||
//
|
||||
// Loops left behind microkernels.
|
||||
LABEL(K_LEFT_LOOP)
|
||||
" cmp x8, #0 \n\t" // End of exec.
|
||||
BEQ(WRITE_MEM_PREP)
|
||||
" mov x14, x0 \n\t" // Load A col.
|
||||
" ld1 {v15.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v15.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v16.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v16.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v17.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v17.d}[1], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" ldr q24, [x1, #16*0] \n\t" // Load B row.
|
||||
" ldr q25, [x1, #16*1] \n\t"
|
||||
" ldr d26, [x1, #16*2] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
" sub x8, x8, #1 \n\t"
|
||||
DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(15,16,17,24,25,26,0,xzr,-1,xzr,-1,noload)
|
||||
BRANCH(K_LEFT_LOOP)
|
||||
//
|
||||
// Scale and write to memory.
|
||||
LABEL(WRITE_MEM_PREP)
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta.
|
||||
" ld1r {v31.2d}, [x8] \n\t"
|
||||
" fmov d26, #1.0 \n\t"
|
||||
" fcmp d30, d26 \n\t"
|
||||
BEQ(UNIT_ALPHA)
|
||||
DSCALE5V(0,1,2,3,4,30,0)
|
||||
DSCALE5V(5,6,7,8,9,30,0)
|
||||
DSCALE5V(10,11,12,13,14,30,0)
|
||||
LABEL(UNIT_ALPHA)
|
||||
" \n\t"
|
||||
" mov x1, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x7, #8 \n\t" // Check for column-storage.
|
||||
BNE(WRITE_MEM_C)
|
||||
// Unlike other RV kernels, here row-storage of C requires
|
||||
// in-register transpose.
|
||||
" trn1 v15.2d, v0.2d, v1.2d \n\t"
|
||||
" trn2 v16.2d, v0.2d, v1.2d \n\t"
|
||||
" trn1 v17.2d, v2.2d, v3.2d \n\t"
|
||||
" trn2 v18.2d, v2.2d, v3.2d \n\t"
|
||||
" \n\t"
|
||||
" trn1 v19.2d, v5.2d, v6.2d \n\t"
|
||||
" trn2 v20.2d, v5.2d, v6.2d \n\t"
|
||||
" trn1 v21.2d, v7.2d, v8.2d \n\t"
|
||||
" trn2 v22.2d, v7.2d, v8.2d \n\t"
|
||||
" \n\t"
|
||||
" trn1 v23.2d, v10.2d, v11.2d \n\t"
|
||||
" trn2 v24.2d, v10.2d, v11.2d \n\t"
|
||||
" trn1 v25.2d, v12.2d, v13.2d \n\t"
|
||||
" trn2 v26.2d, v12.2d, v13.2d \n\t"
|
||||
" \n\t"
|
||||
" fcmp d31, #0.0 \n\t"
|
||||
BEQ(ZERO_BETA_R)
|
||||
DLOADC_2PHV_R_FWD(0,1,28,0,x1,0,x6,x8)
|
||||
DLOADC_2PHV_R_FWD(2,3,28,1,x1,0,x6,x8)
|
||||
DLOADC_2PHV_R_FWD(5,6,29,0,x1,0,x6,x8)
|
||||
DLOADC_2PHV_R_FWD(7,8,29,1,x1,0,x6,x8)
|
||||
DLOADC_2PHV_R_FWD(10,11,30,0,x1,0,x6,x8)
|
||||
DLOADC_2PHV_R_FWD(12,13,30,1,x1,0,x6,x8)
|
||||
DSCALEA5V(15,17,16,18,4,0,1,2,3,28,31,0)
|
||||
DSCALEA5V(19,21,20,22,9,5,6,7,8,29,31,0)
|
||||
DSCALEA5V(23,25,24,26,14,10,11,12,13,30,31,0)
|
||||
LABEL(ZERO_BETA_R)
|
||||
#ifndef __clang__
|
||||
" cmp x12, #1 \n\t"
|
||||
BRANCH(PRFM_END_R)
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t"
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*0] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*1] \n\t"
|
||||
LABEL(PRFM_END_R)
|
||||
#endif
|
||||
DSTOREC_2PHV_R_FWD(15,17,4,0,x5,0,x6,x8)
|
||||
DSTOREC_2PHV_R_FWD(16,18,4,1,x5,0,x6,x8)
|
||||
DSTOREC_2PHV_R_FWD(19,21,9,0,x5,0,x6,x8)
|
||||
DSTOREC_2PHV_R_FWD(20,22,9,1,x5,0,x6,x8)
|
||||
DSTOREC_2PHV_R_FWD(23,25,14,0,x5,0,x6,x8)
|
||||
DSTOREC_2PHV_R_FWD(24,26,14,1,x5,0,x6,x8)
|
||||
BRANCH(END_WRITE_MEM)
|
||||
//
|
||||
// C storage in columns.
|
||||
LABEL(WRITE_MEM_C)
|
||||
" fcmp d31, #0.0 \n\t"
|
||||
BEQ(ZERO_BETA_C)
|
||||
DLOADC_3V_C_FWD(15,20,25,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(16,21,26,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(17,22,27,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(18,23,28,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(19,24,29,x1,0,x7)
|
||||
DSCALEA5V(0,1,2,3,4,15,16,17,18,19,31,0)
|
||||
DSCALEA5V(5,6,7,8,9,20,21,22,23,24,31,0)
|
||||
DSCALEA5V(10,11,12,13,14,25,26,27,28,29,31,0)
|
||||
LABEL(ZERO_BETA_C)
|
||||
#ifndef __clang__
|
||||
" cmp x12, #1 \n\t"
|
||||
BRANCH(PRFM_END_C)
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t"
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*0] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*1] \n\t"
|
||||
LABEL(PRFM_END_C)
|
||||
#endif
|
||||
DSTOREC_3V_C_FWD(0,5,10,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(1,6,11,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(2,7,12,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(3,8,13,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(4,9,14,x5,0,x7)
|
||||
//
|
||||
// End of this microkernel.
|
||||
LABEL(END_WRITE_MEM)
|
||||
" \n\t"
|
||||
" subs x12, x12, #1 \n\t"
|
||||
BEQ(END_EXEC)
|
||||
" \n\t"
|
||||
" mov x8, #6 \n\t"
|
||||
" madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel.
|
||||
" add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel.
|
||||
BRANCH(MILLIKER_MLOOP)
|
||||
//
|
||||
// End of execution.
|
||||
LABEL(END_EXEC)
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_a] "m" (rs_a),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ps_a] "m" (ps_a),
|
||||
[rs_b] "m" (rs_b),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
// In Clang, even "m"-passed parameter takes 1 register.
|
||||
// Have to disable prefetching to pass compilation.
|
||||
#ifndef __clang__
|
||||
[a_next] "r" (a_next),
|
||||
[b_next] "r" (b_next),
|
||||
#endif
|
||||
[m_iter] "m" (m_iter),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta)
|
||||
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
|
||||
"x8", "x9", "x10","x11","x12","x13","x14",
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v8", "v9", "v10","v11","v12","v13","v14","v15",
|
||||
"v16","v17","v18","v19","v20","v21","v22","v23",
|
||||
"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
consider_edge_cases:
|
||||
// Forward address.
|
||||
a = a + m_iter * ps_a;
|
||||
c = c + m_iter * 6 * rs_c;
|
||||
auxinfo_t data_d6x4mn = *data;
|
||||
bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
|
||||
bli_dgemmsup_rv_armv8a_int_6x4mn
|
||||
(
|
||||
conja, conjb, m_left, 5, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
477
kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
Normal file
477
kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
Normal file
@@ -0,0 +1,477 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
|
||||
// Label locality & misc.
|
||||
#include "../armv8a_asm_utils.h"
|
||||
|
||||
// Nanokernel operations.
|
||||
#include "../armv8a_asm_d2x2.h"
|
||||
|
||||
/* Order of row-major DGEMM_6x6's execution in 2x2 blocks:
|
||||
*
|
||||
* +---+ +---+ +---+
|
||||
* | 0 | | 1 | | 2 |
|
||||
* +---+ +---+ +---+
|
||||
* +---+ +---+ +---+
|
||||
* | 3 | | 4 | | 5 |
|
||||
* +---+ +---+ +---+
|
||||
* +---+ +---+ +---+
|
||||
* | 6 | | 7 | | 8 |
|
||||
* +---+ +---+ +---+
|
||||
*
|
||||
*/
|
||||
/* One update step of the 6x6 tile: nine DGEMM_2X2_NANOKERNEL invocations, each
 * taking two accumulators, one B register and one A register, issued row pair
 * by row pair, with loads for a later step interleaved between them.
 *   Cxy       - register numbers of the 18 accumulators.
 *   A0..A2    - register numbers holding A operands (one per row pair).
 *   B0..B2    - register numbers holding B operands (one per column pair).
 *   AELEMADDR - address register used to reload A0 and A1 element by element,
 *   AELEMST     post-incremented by AELEMST after every element.
 *   BADDR     - base register and offset used to reload B0.
 *   BSHIFT
 *   LOADNEXT  - `load` or `noload`; token-pasted onto DGEMM_LOAD1V_ and
 *               DGEMM_LOAD1V_G_, so `noload` removes the interleaved loads.
 * A0 and A1 are reloaded right after their last use in this step; B0 is
 * reloaded after its last use. A2, B1 and B2 are NOT reloaded by this macro.
 * NOTE(review): the per-lane arithmetic is defined by DGEMM_2X2_NANOKERNEL in
 * armv8a_asm_d2x2.h, which is not visible from here.
 */
#define DGEMM_6X6_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
|
||||
DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \
|
||||
DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \
|
||||
DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \
|
||||
DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \
|
||||
DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \
|
||||
DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \
|
||||
DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \
|
||||
DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \
|
||||
DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \
|
||||
DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \
|
||||
DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \
|
||||
DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2)
|
||||
|
||||
// Interleaving load or not.
|
||||
// DGEMM_LOAD1V_<noload|load>(V1,ADDR,IMM): the suffix is chosen by token
// pasting from a LOADNEXT macro argument.
//   noload - expands to nothing.
//   load   - emits one `ldr q<V1>, [<ADDR>, #<IMM>]`; ADDR is not modified.
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
|
||||
#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
|
||||
" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t"
|
||||
|
||||
// DGEMM_LOAD1V_G_<noload|load>(V1,ADDR,ST): element-wise variant of the load
// above, for operands whose two elements are not adjacent in memory.
//   noload - expands to nothing.
//   load   - fills lane 0 and then lane 1 of v<V1> with two single-lane `ld1`
//            loads; ADDR is post-incremented by ST after each of them.
#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST)
|
||||
#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \
|
||||
" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \
|
||||
" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t"
|
||||
|
||||
// Prefetch C in the long direction.
|
||||
// DPRFMC_FWD(CADDR,DLONGC): issue one PLDL1KEEP prefetch at [CADDR], then
// advance CADDR by DLONGC. CADDR is modified, so pass a scratch copy of the
// C address rather than the register used for the actual stores.
#define DPRFMC_FWD(CADDR,DLONGC) \
|
||||
" prfm PLDL1KEEP, ["#CADDR"] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#DLONGC" \n\t"
|
||||
|
||||
// For row-storage of C.
|
||||
// Row-storage load/store of three vectors. Both are plain aliases of the
// column-storage macros: all six arguments are forwarded unchanged, with RSC
// taking the place of CSC as the amount CADDR is advanced by afterwards.
#define DLOADC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \
|
||||
DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,RSC)
|
||||
#define DSTOREC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \
|
||||
DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,RSC)
|
||||
|
||||
// For column-storage of C.
|
||||
// DLOADC_3V_C_FWD / DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC):
// transfer three vector registers between the register file and memory at
// CADDR: C0,C1 through DLOAD2V/DSTORE2V at offset CSHIFT, C2 through
// DLOAD1V/DSTORE1V at offset CSHIFT+32. CADDR is then advanced by CSC.
// NOTE(review): DLOAD2V/DLOAD1V/DSTORE2V/DSTORE1V come from an included header
// that is not visible here; presumably the 2V forms cover the 32 bytes
// starting at CSHIFT - confirm.
#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
|
||||
DLOAD2V(C0,C1,CADDR,CSHIFT) \
|
||||
DLOAD1V(C2,CADDR,CSHIFT+32) \
|
||||
" add "#CADDR", "#CADDR", "#CSC" \n\t"
|
||||
#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
|
||||
DSTORE2V(C0,C1,CADDR,CSHIFT) \
|
||||
DSTORE1V(C2,CADDR,CSHIFT+32) \
|
||||
" add "#CADDR", "#CADDR", "#CSC" \n\t"
|
||||
|
||||
// Six-register wrappers built from the 4- and 2-register scaling macros.
//   DSCALE6V (V0..V5, A, IDX)          - applied to V0..V5 with scalar register
//                                        A, lane IDX.
//   DSCALEA6V(D0..D5, S0..S5, A, IDX)  - applied pairwise to (Dn, Sn).
// NOTE(review): DSCALE4V/DSCALE2V/DSCALEA4V/DSCALEA2V are defined in a header
// not shown here. Judging from how the kernel uses them (alpha scaling, then
// accumulating beta-scaled C), DSCALE presumably computes V *= A[IDX] and
// DSCALEA presumably D += S * A[IDX] - confirm against the header.
#define DSCALE6V(V0,V1,V2,V3,V4,V5,A,IDX) \
|
||||
DSCALE4V(V0,V1,V2,V3,A,IDX) \
|
||||
DSCALE2V(V4,V5,A,IDX)
|
||||
#define DSCALEA6V(D0,D1,D2,D3,D4,D5,S0,S1,S2,S3,S4,S5,A,IDX) \
|
||||
DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
|
||||
DSCALEA2V(D4,D5,S4,S5,A,IDX)
|
||||
|
||||
|
||||
void bli_dgemmsup_rv_armv8a_asm_6x6m
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* data,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
assert( n0 == 6 );
|
||||
|
||||
// LLVM has very bad routing ability for inline asm.
|
||||
// Limit number of registers in case of Clang compilation.
|
||||
#ifndef __clang__
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
#endif
|
||||
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k0 / 8;
|
||||
uint64_t k_left = k0 % 8;
|
||||
|
||||
int64_t m_iter = m0 / 6;
|
||||
int64_t m_left = m0 % 6;
|
||||
|
||||
uint64_t rs_a = rs_a0;
|
||||
uint64_t cs_a = cs_a0;
|
||||
uint64_t rs_b = rs_b0;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
// uint64_t cs_b = cs_b0;
|
||||
assert( cs_b0 == 1 );
|
||||
|
||||
if ( m_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
" ldr x10, %[a] \n\t"
|
||||
" ldr x13, %[c] \n\t"
|
||||
" ldr x12, %[m_iter] \n\t"
|
||||
" ldr x11, %[ps_a] \n\t" // Panel-skip of A.
|
||||
" ldr x9, %[rs_a] \n\t" // Row-skip of A.
|
||||
" ldr x2, %[cs_a] \n\t" // Column-skip of A.
|
||||
" ldr x3, %[rs_b] \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
" \n\t"
|
||||
" \n\t" // Multiply some address skips by sizeof(double).
|
||||
" lsl x11, x11, #3 \n\t" // ps_a
|
||||
" lsl x9, x9, #3 \n\t" // rs_a
|
||||
" lsl x2, x2, #3 \n\t" // cs_a
|
||||
" lsl x3, x3, #3 \n\t" // rs_b
|
||||
" lsl x6, x6, #3 \n\t" // rs_c
|
||||
" lsl x7, x7, #3 \n\t" // cs_c
|
||||
" \n\t"
|
||||
" mov x1, x5 \n\t"
|
||||
" cmp x7, #8 \n\t" // Prefetch column-strided C.
|
||||
BEQ(C_PREFETCH_COLS)
|
||||
// This prefetch will not cover further mker perts. Skip.
|
||||
//
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
BRANCH(C_PREFETCH_END)
|
||||
LABEL(C_PREFETCH_COLS)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
LABEL(C_PREFETCH_END)
|
||||
//
|
||||
// Millikernel.
|
||||
LABEL(MILLIKER_MLOOP)
|
||||
" \n\t"
|
||||
" mov x0, x10 \n\t" // Parameters to be reloaded
|
||||
" mov x5, x13 \n\t" // within each millikernel loop.
|
||||
" ldr x1, %[b] \n\t"
|
||||
" ldr x4, %[k_mker] \n\t"
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
// Storage scheme:
|
||||
// V[ 0:17] <- C
|
||||
// V[18:23] <- A
|
||||
// V[24:31] <- B
|
||||
// Under this scheme, the following is defined:
|
||||
#define DGEMM_6X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT)
|
||||
// Load from memory.
|
||||
LABEL(LOAD_ABC)
|
||||
" \n\t" // No-microkernel early return is a must
|
||||
" cmp x4, #0 \n\t" // to avoid out-of-boundary read.
|
||||
BEQ(CLEAR_CCOLS)
|
||||
" \n\t"
|
||||
" mov x14, x0 \n\t" // Load A.
|
||||
" ld1 {v18.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v18.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v19.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v19.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v20.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v20.d}[1], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
" ld1 {v21.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v21.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v22.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v22.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v23.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v23.d}[1], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
" \n\t"
|
||||
" ldr q24, [x1, #16*0] \n\t" // Load B.
|
||||
" ldr q25, [x1, #16*1] \n\t"
|
||||
" ldr q26, [x1, #16*2] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
" ldr q27, [x1, #16*0] \n\t"
|
||||
" ldr q28, [x1, #16*1] \n\t"
|
||||
" ldr q29, [x1, #16*2] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
" ldr q30, [x1, #16*0] \n\t"
|
||||
" ldr q31, [x1, #16*1] \n\t"
|
||||
LABEL(CLEAR_CCOLS)
|
||||
CLEAR4V(0,1,2,3)
|
||||
CLEAR2V(4,5)
|
||||
CLEAR4V(6,7,8,9)
|
||||
CLEAR2V(10,11)
|
||||
CLEAR4V(12,13,14,15)
|
||||
CLEAR2V(16,17)
|
||||
// No-microkernel early return, once again.
|
||||
BEQ(K_LEFT_LOOP)
|
||||
//
|
||||
// Microkernel is defined here as:
|
||||
#define DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2) \
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,x14,x9,x1,16*2,load) \
|
||||
"ld1 {v"#A2".d}[0], [x14], x9 \n\t" \
|
||||
"ld1 {v"#A2".d}[1], [x14], x9 \n\t" \
|
||||
"add x0, x0, x2 \n\t" \
|
||||
"mov x14, x0 \n\t" \
|
||||
"add x1, x1, x3 \n\t" \
|
||||
"ldr q"#B1", [x1, #16*0] \n\t" \
|
||||
"ldr q"#B2", [x1, #16*1] \n\t"
|
||||
// Start microkernel loop.
|
||||
LABEL(K_MKER_LOOP)
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,24,25,26)
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,27,28,29)
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,30,31,24)
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,25,26,27)
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,28,29,30)
|
||||
" \n\t" // Decrease counter before final replica.
|
||||
" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem.
|
||||
BEQ(FIN_MKER_LOOP)
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,31,24,25)
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,26,27,28)
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,29,30,31)
|
||||
BRANCH(K_MKER_LOOP)
|
||||
//
|
||||
// Final microkernel loop.
|
||||
LABEL(FIN_MKER_LOOP)
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC(21,22,23,31,24,25,x14,x9,x1,16*2,load)
|
||||
" ld1 {v23.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v23.d}[1], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC(18,19,20,26,27,28,xzr,-1,xzr,-1,noload)
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC(21,22,23,29,30,31,xzr,-1,xzr,-1,noload)
|
||||
//
|
||||
// Loops left behind microkernels.
|
||||
LABEL(K_LEFT_LOOP)
|
||||
" cmp x8, #0 \n\t" // End of exec.
|
||||
BEQ(WRITE_MEM_PREP)
|
||||
" mov x14, x0 \n\t"
|
||||
" ld1 {v18.d}[0], [x14], x9 \n\t" // Load A col.
|
||||
" ld1 {v18.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v19.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v19.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v20.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v20.d}[1], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" ldr q24, [x1, #16*0] \n\t" // Load B row.
|
||||
" ldr q25, [x1, #16*1] \n\t"
|
||||
" ldr q26, [x1, #16*2] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
" sub x8, x8, #1 \n\t"
|
||||
DGEMM_6X6_MKER_LOOP_PLAIN_LOC(18,19,20,24,25,26,xzr,-1,xzr,-1,noload)
|
||||
BRANCH(K_LEFT_LOOP)
|
||||
//
|
||||
// Scale and write to memory.
|
||||
LABEL(WRITE_MEM_PREP)
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta.
|
||||
" ld1r {v31.2d}, [x8] \n\t"
|
||||
" fmov d26, #1.0 \n\t"
|
||||
" fcmp d30, d26 \n\t"
|
||||
BEQ(UNIT_ALPHA)
|
||||
DSCALE6V(0,1,2,3,4,5,30,0)
|
||||
DSCALE6V(6,7,8,9,10,11,30,0)
|
||||
DSCALE6V(12,13,14,15,16,17,30,0)
|
||||
LABEL(UNIT_ALPHA)
|
||||
" \n\t"
|
||||
" mov x1, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x7, #8 \n\t" // Check for column-storage.
|
||||
BNE(WRITE_MEM_C)
|
||||
//
|
||||
// C storage in rows.
|
||||
" fcmp d31, #0.0 \n\t"
|
||||
BEQ(ZERO_BETA_R_1_2)
|
||||
DLOADC_3V_R_FWD(18,19,20,x1,0,x6)
|
||||
DLOADC_3V_R_FWD(21,22,23,x1,0,x6)
|
||||
DSCALEA6V(0,1,2,3,4,5,18,19,20,21,22,23,31,0)
|
||||
LABEL(ZERO_BETA_R_1_2)
|
||||
DSTOREC_3V_R_FWD(0,1,2,x5,0,x6)
|
||||
DSTOREC_3V_R_FWD(3,4,5,x5,0,x6)
|
||||
BEQ(ZERO_BETA_R_3_4_5_6)
|
||||
DLOADC_3V_R_FWD(18,19,20,x1,0,x6)
|
||||
DLOADC_3V_R_FWD(21,22,23,x1,0,x6)
|
||||
DLOADC_3V_R_FWD(0,1,2,x1,0,x6)
|
||||
DLOADC_3V_R_FWD(3,4,5,x1,0,x6)
|
||||
DSCALEA6V(6,7,8,9,10,11,18,19,20,21,22,23,31,0)
|
||||
DSCALEA6V(12,13,14,15,16,17,0,1,2,3,4,5,31,0)
|
||||
LABEL(ZERO_BETA_R_3_4_5_6)
|
||||
#ifndef __clang__
|
||||
" cmp x12, #1 \n\t"
|
||||
BRANCH(PRFM_END_R)
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t"
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*0] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*1] \n\t"
|
||||
LABEL(PRFM_END_R)
|
||||
#endif
|
||||
DSTOREC_3V_R_FWD(6,7,8,x5,0,x6)
|
||||
DSTOREC_3V_R_FWD(9,10,11,x5,0,x6)
|
||||
DSTOREC_3V_R_FWD(12,13,14,x5,0,x6)
|
||||
DSTOREC_3V_R_FWD(15,16,17,x5,0,x6)
|
||||
BRANCH(END_WRITE_MEM)
|
||||
//
|
||||
// C storage in columns.
|
||||
LABEL(WRITE_MEM_C)
|
||||
// In-register transpose,
|
||||
// do transposition in row-order.
|
||||
" trn1 v18.2d, v0.2d, v3.2d \n\t" // Row 0-1.
|
||||
" trn2 v19.2d, v0.2d, v3.2d \n\t"
|
||||
" trn1 v20.2d, v1.2d, v4.2d \n\t"
|
||||
" trn2 v21.2d, v1.2d, v4.2d \n\t"
|
||||
" trn1 v22.2d, v2.2d, v5.2d \n\t"
|
||||
" trn2 v23.2d, v2.2d, v5.2d \n\t"
|
||||
" \n\t"
|
||||
" trn1 v24.2d, v6.2d, v9.2d \n\t" // Row 2-3.
|
||||
" trn2 v25.2d, v6.2d, v9.2d \n\t"
|
||||
" trn1 v26.2d, v7.2d, v10.2d \n\t"
|
||||
" trn2 v27.2d, v7.2d, v10.2d \n\t"
|
||||
" trn1 v28.2d, v8.2d, v11.2d \n\t"
|
||||
" trn2 v29.2d, v8.2d, v11.2d \n\t"
|
||||
" \n\t"
|
||||
" trn1 v0.2d, v12.2d, v15.2d \n\t" // Row 4-5.
|
||||
" trn2 v1.2d, v12.2d, v15.2d \n\t"
|
||||
" trn1 v2.2d, v13.2d, v16.2d \n\t"
|
||||
" trn2 v3.2d, v13.2d, v16.2d \n\t"
|
||||
" trn1 v4.2d, v14.2d, v17.2d \n\t"
|
||||
" trn2 v5.2d, v14.2d, v17.2d \n\t"
|
||||
" \n\t"
|
||||
" fcmp d31, #0.0 \n\t"
|
||||
BEQ(ZERO_BETA_C_1_2)
|
||||
DLOADC_3V_C_FWD(6,7,8,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(9,10,11,x1,0,x7)
|
||||
DSCALEA6V(18,24,0,19,25,1,6,7,8,9,10,11,31,0)
|
||||
LABEL(ZERO_BETA_C_1_2)
|
||||
DSTOREC_3V_C_FWD(18,24,0,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(19,25,1,x5,0,x7)
|
||||
BEQ(ZERO_BETA_C_3_4_5_6)
|
||||
DLOADC_3V_C_FWD(6,7,8,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(9,10,11,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(12,13,14,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(15,16,17,x1,0,x7)
|
||||
DSCALEA6V(20,26,2,21,27,3,6,7,8,9,10,11,31,0)
|
||||
DSCALEA6V(22,28,4,23,29,5,12,13,14,15,16,17,31,0)
|
||||
LABEL(ZERO_BETA_C_3_4_5_6)
|
||||
#ifndef __clang__
|
||||
" cmp x12, #1 \n\t"
|
||||
BRANCH(PRFM_END_C)
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t"
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*0] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*1] \n\t"
|
||||
LABEL(PRFM_END_C)
|
||||
#endif
|
||||
DSTOREC_3V_C_FWD(20,26,2,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(21,27,3,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(22,28,4,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(23,29,5,x5,0,x7)
|
||||
//
|
||||
// End of this microkernel.
|
||||
LABEL(END_WRITE_MEM)
|
||||
" \n\t"
|
||||
" subs x12, x12, #1 \n\t"
|
||||
BEQ(END_EXEC)
|
||||
" \n\t"
|
||||
" mov x8, #6 \n\t"
|
||||
" madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel.
|
||||
" add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel.
|
||||
BRANCH(MILLIKER_MLOOP)
|
||||
//
|
||||
// End of execution.
|
||||
LABEL(END_EXEC)
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_a] "m" (rs_a),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ps_a] "m" (ps_a),
|
||||
[rs_b] "m" (rs_b),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
// In Clang, even "m"-passed parameter takes 1 register.
|
||||
// Have to disable prefetching to pass compilation.
|
||||
#ifndef __clang__
|
||||
[a_next] "r" (a_next),
|
||||
[b_next] "r" (b_next),
|
||||
#endif
|
||||
[m_iter] "m" (m_iter),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta)
|
||||
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
|
||||
"x8", "x9", "x10","x11","x12","x13","x14",
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v8", "v9", "v10","v11","v12","v13","v14","v15",
|
||||
"v16","v17","v18","v19","v20","v21","v22","v23",
|
||||
"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
consider_edge_cases:
|
||||
// Forward address.
|
||||
a = a + m_iter * ps_a;
|
||||
c = c + m_iter * 6 * rs_c;
|
||||
auxinfo_t data_d6x4mn = *data;
|
||||
bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
|
||||
bli_dgemmsup_rv_armv8a_int_6x4mn
|
||||
(
|
||||
conja, conjb, m_left, 6, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
513
kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
Normal file
513
kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
Normal file
@@ -0,0 +1,513 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
|
||||
// Label locality & misc.
|
||||
#include "../armv8a_asm_utils.h"
|
||||
|
||||
// Nanokernel operations.
|
||||
#include "../armv8a_asm_d2x2.h"
|
||||
|
||||
/* Odd-NR dgemmsup_rv_*m kernels are special in that
|
||||
* despite the row-major name, C is laid out in COLUMNS in the register space.
|
||||
*
|
||||
* Block order:
|
||||
*
|
||||
* +---+ +---+ +---+
|
||||
* | 0 | | 3 | | 6 | |9
|
||||
* +---+ +---+ +---+ |
|
||||
* +---+ +---+ +---+
|
||||
* | 1 | | 4 | | 7 | |10
|
||||
* +---+ +---+ +---+ |
|
||||
* +---+ +---+ +---+
|
||||
* | 2 | | 5 | | 8 | |11
|
||||
* +---+ +---+ +---+ |
|
||||
*
|
||||
*/
|
||||
/* One update step of the 6x7 tile with C held column-wise in registers.
 *   Cr0..Cr6  - register numbers of the accumulators of row pair r (r=0,1,2),
 *               one register per column.
 *   A0..A2    - register numbers holding the A operands (one per row pair).
 *   B0..B2    - register numbers holding B operands for columns 0-5, consumed
 *               by nine DGEMM_2X2_NANOKERNEL invocations.
 *   B3, BIDX  - register and lane holding the B element for the seventh
 *               column, applied with three explicit
 *               `fmla Cr6.2d, Ar.2d, B3.d[BIDX]`.
 *   AELEMADDR/AELEMST - address register / post-increment used to reload A0
 *               and A1 element by element after their last use.
 *   BADDR/BSHIFT - base register / offset used to reload B0, B1 and B2 from
 *               BSHIFT, BSHIFT+16 and BSHIFT+32 after their last use.
 *   LOADNEXT  - `load` or `noload`; token-pasted onto DGEMM_LOAD1V_ and
 *               DGEMM_LOAD1V_G_, so `noload` removes the interleaved loads.
 * A2 and B3 are NOT reloaded by this macro.
 * NOTE(review): the arithmetic of DGEMM_2X2_NANOKERNEL is defined in
 * armv8a_asm_d2x2.h, which is not visible from here.
 */
#define DGEMM_C6X7_MKER_LOOP_PLAIN(C00,C01,C02,C03,C04,C05,C06,C10,C11,C12,C13,C14,C15,C16,C20,C21,C22,C23,C24,C25,C26,A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
|
||||
DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) \
|
||||
DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) \
|
||||
DGEMM_2X2_NANOKERNEL(C20,C21,A2,B0) \
|
||||
DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \
|
||||
DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) \
|
||||
DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1) \
|
||||
DGEMM_2X2_NANOKERNEL(C22,C23,A2,B1) \
|
||||
DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) \
|
||||
DGEMM_2X2_NANOKERNEL(C04,C05,A0,B2) \
|
||||
DGEMM_2X2_NANOKERNEL(C14,C15,A1,B2) \
|
||||
DGEMM_2X2_NANOKERNEL(C24,C25,A2,B2) \
|
||||
DGEMM_LOAD1V_ ##LOADNEXT (B2,BADDR,BSHIFT+32) \
|
||||
" fmla v"#C06".2d, v"#A0".2d, v"#B3".d["#BIDX"] \n\t" \
|
||||
DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \
|
||||
" fmla v"#C16".2d, v"#A1".2d, v"#B3".d["#BIDX"] \n\t" \
|
||||
DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \
|
||||
" fmla v"#C26".2d, v"#A2".2d, v"#B3".d["#BIDX"] \n\t"
|
||||
|
||||
// Interleaving load or not.
|
||||
// DGEMM_LOAD1V_<noload|load>(V1,ADDR,IMM): the suffix is chosen by token
// pasting from a LOADNEXT macro argument.
//   noload - expands to nothing.
//   load   - emits one `ldr q<V1>, [<ADDR>, #<IMM>]`; ADDR is not modified.
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
|
||||
#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
|
||||
" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t"
|
||||
|
||||
// #define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM)
|
||||
// #define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \
|
||||
// DGEMM_LOAD1V_load(V1,ADDR,IMM) \
|
||||
// DGEMM_LOAD1V_load(V2,ADDR,IMM+16)
|
||||
|
||||
// DGEMM_LOAD1V_G_<noload|load>(V1,ADDR,ST): element-wise load for operands
// whose two elements are not adjacent in memory.
//   noload - expands to nothing.
//   load   - fills lane 0 and then lane 1 of v<V1> with two single-lane `ld1`
//            loads; ADDR is post-incremented by ST after each of them.
#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST)
|
||||
#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \
|
||||
" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \
|
||||
" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t"
|
||||
|
||||
// Prefetch C in the long direction.
|
||||
// DPRFMC_FWD(CADDR,DLONGC): issue one PLDL1KEEP prefetch at [CADDR], then
// advance CADDR by DLONGC. CADDR is modified, so pass a scratch copy of the
// C address rather than the register used for the actual stores.
#define DPRFMC_FWD(CADDR,DLONGC) \
|
||||
" prfm PLDL1KEEP, ["#CADDR"] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#DLONGC" \n\t"
|
||||
|
||||
// For column-storage of C.
|
||||
// DLOADC_3V_C_FWD / DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC):
// transfer three vector registers between the register file and memory at
// CADDR: C0,C1 through DLOAD2V/DSTORE2V at offset CSHIFT, C2 through
// DLOAD1V/DSTORE1V at offset CSHIFT+32. CADDR is then advanced by CSC.
// NOTE(review): DLOAD2V/DLOAD1V/DSTORE2V/DSTORE1V come from an included header
// that is not visible here; presumably the 2V forms cover the 32 bytes
// starting at CSHIFT - confirm.
#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
|
||||
DLOAD2V(C0,C1,CADDR,CSHIFT) \
|
||||
DLOAD1V(C2,CADDR,CSHIFT+32) \
|
||||
" add "#CADDR", "#CADDR", "#CSC" \n\t"
|
||||
#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
|
||||
DSTORE2V(C0,C1,CADDR,CSHIFT) \
|
||||
DSTORE1V(C2,CADDR,CSHIFT+32) \
|
||||
" add "#CADDR", "#CADDR", "#CSC" \n\t"
|
||||
|
||||
// For row-storage of C: Store 3+1/2 vectors.
|
||||
// DLOADC_3PHV_R_FWD / DSTOREC_3PHV_R_FWD
//   (C0,C1,C2,C3,CIDX,CADDR,CSHIFT,RSC,CTMP):
// transfer "three and a half" vectors, i.e. seven doubles of one row of C.
//   - CTMP is first set to CADDR + CSHIFT+48 (the add's immediate is the
//     literal text of CSHIFT followed by "+48", so CSHIFT must be a constant);
//   - C0,C1 go through DLOAD2V/DSTORE2V at CSHIFT, C2 through
//     DLOAD1V/DSTORE1V at CSHIFT+32;
//   - the seventh element is moved to/from lane CIDX of v<C3> with a
//     single-lane ld1/st1 at [CTMP]; the other lane of C3 is left untouched;
//   - CADDR is finally advanced by RSC.
// CTMP is a scratch register and is overwritten.
#define DLOADC_3PHV_R_FWD(C0,C1,C2,C3,CIDX,CADDR,CSHIFT,RSC,CTMP) \
|
||||
" add "#CTMP", "#CADDR", "#CSHIFT"+48 \n\t" \
|
||||
DLOAD2V(C0,C1,CADDR,CSHIFT) \
|
||||
DLOAD1V(C2,CADDR,CSHIFT+32) \
|
||||
" ld1 {v"#C3".d}["#CIDX"], ["#CTMP"] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
#define DSTOREC_3PHV_R_FWD(C0,C1,C2,C3,CIDX,CADDR,CSHIFT,RSC,CTMP) \
|
||||
" add "#CTMP", "#CADDR", "#CSHIFT"+48 \n\t" \
|
||||
DSTORE2V(C0,C1,CADDR,CSHIFT) \
|
||||
DSTORE1V(C2,CADDR,CSHIFT+32) \
|
||||
" st1 {v"#C3".d}["#CIDX"], ["#CTMP"] \n\t" \
|
||||
" add "#CADDR", "#CADDR", "#RSC" \n\t"
|
||||
|
||||
// Seven- and three-register wrappers built from the 4-, 2- and 1-register
// scaling macros.
//   DSCALE7V (V0..V6, A, IDX)          - applied to V0..V6 with scalar register
//                                        A, lane IDX.
//   DSCALEA7V(D0..D6, S0..S6, A, IDX)  - applied pairwise to (Dn, Sn).
//   DSCALEA3V(D0..D2, S0..S2, A, IDX)  - applied pairwise to (Dn, Sn).
// NOTE(review): DSCALE{4,2,1}V and DSCALEA{4,2,1}V are defined in a header not
// shown here. Judging from how the kernel uses them (alpha scaling, then
// accumulating beta-scaled C), DSCALE presumably computes V *= A[IDX] and
// DSCALEA presumably D += S * A[IDX] - confirm against the header.
#define DSCALE7V(V0,V1,V2,V3,V4,V5,V6,A,IDX) \
|
||||
DSCALE4V(V0,V1,V2,V3,A,IDX) \
|
||||
DSCALE2V(V4,V5,A,IDX) \
|
||||
DSCALE1V(V6,A,IDX)
|
||||
#define DSCALEA7V(D0,D1,D2,D3,D4,D5,D6,S0,S1,S2,S3,S4,S5,S6,A,IDX) \
|
||||
DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
|
||||
DSCALEA2V(D4,D5,S4,S5,A,IDX) \
|
||||
DSCALEA1V(D6,S6,A,IDX)
|
||||
#define DSCALEA3V(D0,D1,D2,S0,S1,S2,A,IDX) \
|
||||
DSCALEA2V(D0,D1,S0,S1,A,IDX) \
|
||||
DSCALEA1V(D2,S2,A,IDX)
|
||||
|
||||
|
||||
void bli_dgemmsup_rv_armv8a_asm_6x7m
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m0,
|
||||
dim_t n0,
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t rs_a0, inc_t cs_a0,
|
||||
double* restrict b, inc_t rs_b0, inc_t cs_b0,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* data,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
assert( n0 == 7 );
|
||||
|
||||
// LLVM has very bad routing ability for inline asm.
|
||||
// Limit number of registers in case of Clang compilation.
|
||||
#ifndef __clang__
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
#endif
|
||||
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k0 / 7;
|
||||
uint64_t k_left = k0 % 7;
|
||||
|
||||
int64_t m_iter = m0 / 6;
|
||||
int64_t m_left = m0 % 6;
|
||||
|
||||
uint64_t rs_a = rs_a0;
|
||||
uint64_t cs_a = cs_a0;
|
||||
uint64_t rs_b = rs_b0;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
// uint64_t cs_b = cs_b0;
|
||||
assert( cs_b0 == 1 );
|
||||
|
||||
if ( m_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
" ldr x10, %[a] \n\t"
|
||||
" ldr x13, %[c] \n\t"
|
||||
" ldr x12, %[m_iter] \n\t"
|
||||
" ldr x11, %[ps_a] \n\t" // Panel-skip of A.
|
||||
" ldr x9, %[rs_a] \n\t" // Row-skip of A.
|
||||
" ldr x2, %[cs_a] \n\t" // Column-skip of A.
|
||||
" ldr x3, %[rs_b] \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
|
||||
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
|
||||
" \n\t"
|
||||
" \n\t" // Multiply some address skips by sizeof(double).
|
||||
" lsl x11, x11, #3 \n\t" // ps_a
|
||||
" lsl x9, x9, #3 \n\t" // rs_a
|
||||
" lsl x2, x2, #3 \n\t" // cs_a
|
||||
" lsl x3, x3, #3 \n\t" // rs_b
|
||||
" lsl x6, x6, #3 \n\t" // rs_c
|
||||
" lsl x7, x7, #3 \n\t" // cs_c
|
||||
" \n\t"
|
||||
" mov x1, x5 \n\t"
|
||||
" cmp x7, #8 \n\t" // Prefetch column-strided C.
|
||||
BNE(C_PREFETCH_COLS)
|
||||
// This prefetch will not cover further mker perts. Skip.
|
||||
//
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
// DPRFMC_FWD(x1,x6)
|
||||
BRANCH(C_PREFETCH_END)
|
||||
LABEL(C_PREFETCH_COLS)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
DPRFMC_FWD(x1,x7)
|
||||
LABEL(C_PREFETCH_END)
|
||||
//
|
||||
// Millikernel.
|
||||
LABEL(MILLIKER_MLOOP)
|
||||
" \n\t"
|
||||
" mov x0, x10 \n\t" // Parameters to be reloaded
|
||||
" mov x5, x13 \n\t" // within each millikernel loop.
|
||||
" ldr x1, %[b] \n\t"
|
||||
" ldr x4, %[k_mker] \n\t"
|
||||
" ldr x8, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
// Storage scheme:
|
||||
// V[ 0:20] <- C
|
||||
// V[21:27] <- A
|
||||
// V[28:31] <- B
|
||||
// Under this scheme, the following is defined:
|
||||
#define DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT)
|
||||
// Load from memory.
|
||||
LABEL(LOAD_ABC)
|
||||
" \n\t" // No-microkernel early return is a must
|
||||
" cmp x4, #0 \n\t" // to avoid out-of-boundary read.
|
||||
BEQ(CLEAR_CCOLS)
|
||||
" \n\t"
|
||||
" mov x14, x0 \n\t" // Load A.
|
||||
" ld1 {v21.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v21.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v22.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v22.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v23.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v23.d}[1], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
" ld1 {v24.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v24.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v25.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v25.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v26.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v26.d}[1], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
" ld1 {v27.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v27.d}[1], [x14], x9 \n\t"
|
||||
" \n\t"
|
||||
" ldr q28, [x1, #16*0] \n\t" // Load B.
|
||||
" ldr q29, [x1, #16*1] \n\t"
|
||||
" ldr q30, [x1, #16*2] \n\t"
|
||||
" ldr d31, [x1, #16*3] \n\t" // Scalar loads into idx 0.
|
||||
" add x1, x1, x3 \n\t"
|
||||
" \n\t"
|
||||
LABEL(CLEAR_CCOLS)
|
||||
CLEAR4V(0,1,2,3)
|
||||
CLEAR2V(4,5)
|
||||
CLEAR1V(6)
|
||||
CLEAR4V(7,8,9,10)
|
||||
CLEAR2V(11,12)
|
||||
CLEAR1V(13)
|
||||
CLEAR4V(14,15,16,17)
|
||||
CLEAR2V(18,19)
|
||||
CLEAR1V(20)
|
||||
// No-microkernel early return, once again.
|
||||
BEQ(K_LEFT_LOOP)
|
||||
//
|
||||
// Microkernel is defined here as:
|
||||
#define DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3,BIDX) \
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,BIDX,x14,x9,x1,0,load) \
|
||||
"add x0, x0, x2 \n\t" \
|
||||
"mov x14, x0 \n\t" \
|
||||
"ld1 {v"#A2".d}[0], [x14], x9 \n\t" \
|
||||
"ld1 {v"#A2".d}[1], [x14], x9 \n\t" \
|
||||
/* Due to this loading, BIDX can only be 0 here. */ \
|
||||
"ldr d"#B3", [x1, #16*3] \n\t" \
|
||||
"add x1, x1, x3 \n\t"
|
||||
// Start microkernel loop.
|
||||
LABEL(K_MKER_LOOP)
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,28,29,30,31,0)
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29,30,31,0)
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(27,21,22,28,29,30,31,0)
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,28,29,30,31,0)
|
||||
" \n\t" // Decrease counter before final replica.
|
||||
" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem.
|
||||
BEQ(FIN_MKER_LOOP)
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(26,27,21,28,29,30,31,0)
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(22,23,24,28,29,30,31,0)
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,28,29,30,31,0)
|
||||
BRANCH(K_MKER_LOOP)
|
||||
//
|
||||
// Final microkernel loop.
|
||||
LABEL(FIN_MKER_LOOP)
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(26,27,21,28,29,30,31,0,x14,x9,x1,0,load)
|
||||
" add x0, x0, x2 \n\t"
|
||||
" mov x14, x0 \n\t"
|
||||
" ldr d31, [x1, #16*3] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(22,23,24,28,29,30,31,0,xzr,-1,xzr,-1,noload)
|
||||
" ldr q28, [x1, #16*0] \n\t"
|
||||
" ldr q29, [x1, #16*1] \n\t"
|
||||
" ldr q30, [x1, #16*2] \n\t"
|
||||
" ldr d31, [x1, #16*3] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(25,26,27,28,29,30,31,0,xzr,-1,xzr,-1,noload)
|
||||
//
|
||||
// Loops left behind microkernels.
|
||||
LABEL(K_LEFT_LOOP)
|
||||
" cmp x8, #0 \n\t" // End of exec.
|
||||
BEQ(WRITE_MEM_PREP)
|
||||
" mov x14, x0 \n\t" // Load A col.
|
||||
" ld1 {v21.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v21.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v22.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v22.d}[1], [x14], x9 \n\t"
|
||||
" ld1 {v23.d}[0], [x14], x9 \n\t"
|
||||
" ld1 {v23.d}[1], [x14], x9 \n\t"
|
||||
" add x0, x0, x2 \n\t"
|
||||
" ldr q28, [x1, #16*0] \n\t" // Load B row.
|
||||
" ldr q29, [x1, #16*1] \n\t"
|
||||
" ldr q30, [x1, #16*2] \n\t"
|
||||
" ldr d31, [x1, #16*3] \n\t"
|
||||
" add x1, x1, x3 \n\t"
|
||||
" sub x8, x8, #1 \n\t"
|
||||
DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(21,22,23,28,29,30,31,0,xzr,-1,xzr,-1,noload)
|
||||
BRANCH(K_LEFT_LOOP)
|
||||
//
|
||||
// Scale and write to memory.
|
||||
LABEL(WRITE_MEM_PREP)
|
||||
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
" ldr x8, %[beta] \n\t"
|
||||
" ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta.
|
||||
" ld1r {v31.2d}, [x8] \n\t"
|
||||
" fmov d26, #1.0 \n\t"
|
||||
" fcmp d30, d26 \n\t"
|
||||
BEQ(UNIT_ALPHA)
|
||||
DSCALE7V(0,1,2,3,4,5,6,30,0)
|
||||
DSCALE7V(7,8,9,10,11,12,13,30,0)
|
||||
DSCALE7V(14,15,16,17,18,19,20,30,0)
|
||||
LABEL(UNIT_ALPHA)
|
||||
" \n\t"
|
||||
" mov x1, x5 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is x5 itself.
|
||||
" cmp x7, #8 \n\t" // Check for column-storage.
|
||||
BNE(WRITE_MEM_C)
|
||||
// Unlike other RV kernels, here row-storage of C requires
|
||||
// in-register transpose.
|
||||
" trn1 v21.2d, v0.2d, v1.2d \n\t"
|
||||
" trn2 v22.2d, v0.2d, v1.2d \n\t"
|
||||
" trn1 v23.2d, v2.2d, v3.2d \n\t"
|
||||
" trn2 v24.2d, v2.2d, v3.2d \n\t"
|
||||
" trn1 v25.2d, v4.2d, v5.2d \n\t"
|
||||
" trn2 v26.2d, v4.2d, v5.2d \n\t"
|
||||
" \n\t"
|
||||
" trn1 v0.2d, v7.2d, v8.2d \n\t"
|
||||
" trn2 v1.2d, v7.2d, v8.2d \n\t"
|
||||
" trn1 v2.2d, v9.2d, v10.2d \n\t"
|
||||
" trn2 v3.2d, v9.2d, v10.2d \n\t"
|
||||
" trn1 v4.2d, v11.2d, v12.2d \n\t"
|
||||
" trn2 v5.2d, v11.2d, v12.2d \n\t"
|
||||
" \n\t"
|
||||
" trn1 v7.2d, v14.2d, v15.2d \n\t"
|
||||
" trn2 v8.2d, v14.2d, v15.2d \n\t"
|
||||
" trn1 v9.2d, v16.2d, v17.2d \n\t"
|
||||
" trn2 v10.2d, v16.2d, v17.2d \n\t"
|
||||
" trn1 v11.2d, v18.2d, v19.2d \n\t"
|
||||
" trn2 v12.2d, v18.2d, v19.2d \n\t"
|
||||
" \n\t"
|
||||
" fcmp d31, #0.0 \n\t"
|
||||
BEQ(ZERO_BETA_R_1_2)
|
||||
DLOADC_3PHV_R_FWD(14,15,16,30,0,x1,0,x6,x8)
|
||||
DLOADC_3PHV_R_FWD(17,18,19,30,1,x1,0,x6,x8)
|
||||
DSCALEA7V(21,23,25,22,24,26,6,14,15,16,17,18,19,30,31,0)
|
||||
LABEL(ZERO_BETA_R_1_2)
|
||||
DSTOREC_3PHV_R_FWD(21,23,25,6,0,x5,0,x6,x8)
|
||||
DSTOREC_3PHV_R_FWD(22,24,26,6,1,x5,0,x6,x8)
|
||||
BEQ(ZERO_BETA_R_3_4_5_6)
|
||||
DLOADC_3PHV_R_FWD(14,15,16,30,0,x1,0,x6,x8)
|
||||
DLOADC_3PHV_R_FWD(17,18,19,30,1,x1,0,x6,x8)
|
||||
DLOADC_3PHV_R_FWD(21,22,23,28,0,x1,0,x6,x8)
|
||||
DLOADC_3PHV_R_FWD(24,25,26,28,1,x1,0,x6,x8)
|
||||
DSCALEA7V(0,2,4,1,3,5,13,14,15,16,17,18,19,30,31,0)
|
||||
DSCALEA7V(7,9,11,8,10,12,20,21,22,23,24,25,26,28,31,0)
|
||||
LABEL(ZERO_BETA_R_3_4_5_6)
|
||||
#ifndef __clang__
|
||||
" cmp x12, #1 \n\t"
|
||||
BRANCH(PRFM_END_R)
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t"
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*0] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*1] \n\t"
|
||||
LABEL(PRFM_END_R)
|
||||
#endif
|
||||
DSTOREC_3PHV_R_FWD(0,2,4,13,0,x5,0,x6,x8)
|
||||
DSTOREC_3PHV_R_FWD(1,3,5,13,1,x5,0,x6,x8)
|
||||
DSTOREC_3PHV_R_FWD(7,9,11,20,0,x5,0,x6,x8)
|
||||
DSTOREC_3PHV_R_FWD(8,10,12,20,1,x5,0,x6,x8)
|
||||
BRANCH(END_WRITE_MEM)
|
||||
//
|
||||
// C storage in columns.
|
||||
LABEL(WRITE_MEM_C)
|
||||
" fcmp d31, #0.0 \n\t"
|
||||
BEQ(ZERO_BETA_C_1_2)
|
||||
DLOADC_3V_C_FWD(21,22,23,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(24,25,26,x1,0,x7)
|
||||
DSCALEA3V(0,7,14,21,22,23,31,0)
|
||||
DSCALEA3V(1,8,15,24,25,26,31,0)
|
||||
LABEL(ZERO_BETA_C_1_2)
|
||||
DSTOREC_3V_C_FWD(0,7,14,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(1,8,15,x5,0,x7)
|
||||
BEQ(ZERO_BETA_C_3_4_5_6_7)
|
||||
DLOADC_3V_C_FWD(21,22,23,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(24,25,26,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(27,28,29,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(0,7,14,x1,0,x7)
|
||||
DLOADC_3V_C_FWD(1,8,15,x1,0,x7)
|
||||
DSCALEA3V(2,9,16,21,22,23,31,0)
|
||||
DSCALEA3V(3,10,17,24,25,26,31,0)
|
||||
DSCALEA3V(4,11,18,27,28,29,31,0)
|
||||
DSCALEA3V(5,12,19,0,7,14,31,0)
|
||||
DSCALEA3V(6,13,20,1,8,15,31,0)
|
||||
LABEL(ZERO_BETA_C_3_4_5_6_7)
|
||||
#ifndef __clang__
|
||||
" cmp x12, #1 \n\t"
|
||||
BRANCH(PRFM_END_C)
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t"
|
||||
" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*0] \n\t"
|
||||
" prfm PLDL1STRM, [%[b_next], #16*1] \n\t"
|
||||
LABEL(PRFM_END_C)
|
||||
#endif
|
||||
DSTOREC_3V_C_FWD(2,9,16,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(3,10,17,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(4,11,18,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(5,12,19,x5,0,x7)
|
||||
DSTOREC_3V_C_FWD(6,13,20,x5,0,x7)
|
||||
//
|
||||
// End of this microkernel.
|
||||
LABEL(END_WRITE_MEM)
|
||||
" \n\t"
|
||||
" subs x12, x12, #1 \n\t"
|
||||
BEQ(END_EXEC)
|
||||
" \n\t"
|
||||
" mov x8, #6 \n\t"
|
||||
" madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel.
|
||||
" add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel.
|
||||
BRANCH(MILLIKER_MLOOP)
|
||||
//
|
||||
// End of execution.
|
||||
LABEL(END_EXEC)
|
||||
:
|
||||
: [a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[c] "m" (c),
|
||||
[rs_a] "m" (rs_a),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ps_a] "m" (ps_a),
|
||||
[rs_b] "m" (rs_b),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
// In Clang, even an "m"-constrained operand occupies one register.
|
||||
// Prefetching must therefore be disabled under Clang for this to compile.
|
||||
#ifndef __clang__
|
||||
[a_next] "r" (a_next),
|
||||
[b_next] "r" (b_next),
|
||||
#endif
|
||||
[m_iter] "m" (m_iter),
|
||||
[k_mker] "m" (k_mker),
|
||||
[k_left] "m" (k_left),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta)
|
||||
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
|
||||
"x8", "x9", "x10","x11","x12","x13","x14",
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v8", "v9", "v10","v11","v12","v13","v14","v15",
|
||||
"v16","v17","v18","v19","v20","v21","v22","v23",
|
||||
"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
consider_edge_cases:
|
||||
// Forward address.
|
||||
a = a + m_iter * ps_a;
|
||||
c = c + m_iter * 6 * rs_c;
|
||||
auxinfo_t data_d6x4mn = *data;
|
||||
bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
|
||||
bli_dgemmsup_rv_armv8a_int_6x4mn
|
||||
(
|
||||
conja, conjb, m_left, 7, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
@@ -37,7 +37,6 @@
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
|
||||
|
||||
// Label locality & misc.
|
||||
#include "../armv8a_asm_utils.h"
|
||||
@@ -146,47 +145,70 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m
|
||||
{
|
||||
if ( n0 != 8 )
|
||||
{
|
||||
if ( n0 < 8 )
|
||||
{
|
||||
for ( ; n0 >= 4; n0 -= 4 )
|
||||
{
|
||||
dgemmsup_ker_ft ukr_fp;
|
||||
auxinfo_t data_d8xkm = *data;
|
||||
if ( bli_auxinfo_ps_a( data ) == 6 * rs_a0 )
|
||||
{
|
||||
// Use 8x4 Asm kernel for the unpacked case.
|
||||
bli_auxinfo_set_ps_a( 8 * rs_a0, &data_d8xkm );
|
||||
ukr_fp = bli_dgemmsup_rv_armv8a_asm_8x4m;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Cannot change dimension for m when A is packed.
|
||||
ukr_fp = bli_dgemmsup_rv_armv8a_int_6x4mn;
|
||||
}
|
||||
assert( n0 <= 13 );
|
||||
|
||||
ukr_fp
|
||||
(
|
||||
conja, conjb, m0, 4, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, &data_d8xkm, cntx
|
||||
);
|
||||
b += 4 * cs_b0;
|
||||
c += 4 * cs_c0;
|
||||
}
|
||||
if ( n0 > 0 )
|
||||
{
|
||||
bli_dgemmsup_rv_armv8a_int_6x4mn
|
||||
(
|
||||
conja, conjb, m0, n0, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
else
|
||||
// Manual separation.
|
||||
dgemmsup_ker_ft ker_fp1 = NULL;
|
||||
dgemmsup_ker_ft ker_fp2 = NULL;
|
||||
dim_t nr1, nr2;
|
||||
|
||||
if ( n0 == 13 )
|
||||
{
|
||||
assert( FALSE );
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x7m; nr1 = 7;
|
||||
ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr2 = 6;
|
||||
}
|
||||
if ( n0 == 12 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6;
|
||||
ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr2 = 6;
|
||||
}
|
||||
if ( n0 == 11 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6;
|
||||
ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr2 = 5;
|
||||
}
|
||||
if ( n0 == 10 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5;
|
||||
ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr2 = 5;
|
||||
}
|
||||
if ( n0 == 9 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5;
|
||||
ker_fp2 = bli_dgemmsup_rv_armv8a_int_6x4mn; nr2 = 4;
|
||||
}
|
||||
if ( n0 == 7 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x7m; nr1 = 7;
|
||||
}
|
||||
if ( n0 == 6 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6;
|
||||
}
|
||||
if ( n0 == 5 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5;
|
||||
}
|
||||
if ( n0 <= 4 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_int_6x4mn; nr1 = n0;
|
||||
}
|
||||
|
||||
ker_fp1
|
||||
(
|
||||
conja, conjb, m0, nr1, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
b += nr1 * cs_b0;
|
||||
c += nr1 * cs_c0;
|
||||
if ( ker_fp2 )
|
||||
ker_fp2
|
||||
(
|
||||
conja, conjb, m0, nr2, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -534,7 +556,6 @@ consider_edge_cases:
|
||||
// Forward address.
|
||||
a = a + m_iter * ps_a;
|
||||
c = c + m_iter * 6 * rs_c;
|
||||
#if 1
|
||||
auxinfo_t data_d6x4mn = *data;
|
||||
bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
|
||||
bli_dgemmsup_rv_armv8a_int_6x4mn
|
||||
@@ -543,33 +564,6 @@ consider_edge_cases:
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
|
||||
);
|
||||
#else
|
||||
if ( m_left >= 4 )
|
||||
{
|
||||
// Calls 4x8m with only 1 outermost loop.
|
||||
// As only 1 outermost loop is called,
|
||||
// ps_a does not need to be set here.
|
||||
//
|
||||
bli_dgemmsup_rv_armv8a_asm_4x8m
|
||||
(
|
||||
conja, conjb, 4, 8, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
m_left -= 4;
|
||||
a = a + 4 * rs_a;
|
||||
c = c + 4 * rs_c;
|
||||
}
|
||||
if ( m_left )
|
||||
{
|
||||
bli_dgemmsup_r_armv8a_ref2
|
||||
(
|
||||
conja, conjb, m_left, 8, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -37,7 +37,6 @@
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
|
||||
|
||||
// Label locality & misc.
|
||||
#include "../armv8a_asm_utils.h"
|
||||
@@ -146,33 +145,56 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n
|
||||
{
|
||||
if ( m0 != 6 )
|
||||
{
|
||||
// 5 = 4 + 1;
|
||||
// 4;
|
||||
//
|
||||
while ( m0 >= 4 )
|
||||
assert( m0 <= 9 );
|
||||
|
||||
// Manual separation.
|
||||
dgemmsup_ker_ft ker_fp1 = NULL;
|
||||
dgemmsup_ker_ft ker_fp2 = NULL;
|
||||
dim_t mr1, mr2;
|
||||
|
||||
if ( m0 == 9 )
|
||||
{
|
||||
bli_dgemmsup_rv_armv8a_asm_4x8n
|
||||
(
|
||||
conja, conjb, 4, n0, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
m0 -= 4;
|
||||
a += 4 * rs_a0;
|
||||
c += 4 * rs_c0;
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_5x8n; mr1 = 5;
|
||||
ker_fp2 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr2 = 4;
|
||||
}
|
||||
if ( m0 == 8 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4;
|
||||
ker_fp2 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr2 = 4;
|
||||
}
|
||||
if ( m0 == 7 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4;
|
||||
ker_fp2 = bli_dgemmsup_rv_armv8a_int_3x8mn; mr2 = 3;
|
||||
}
|
||||
if ( m0 == 5 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_5x8n; mr1 = 5;
|
||||
}
|
||||
if ( m0 == 4 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4;
|
||||
}
|
||||
if ( m0 < 4 )
|
||||
{
|
||||
ker_fp1 = bli_dgemmsup_rv_armv8a_int_3x8mn; mr1 = m0;
|
||||
}
|
||||
|
||||
// 3, 2, 1;
|
||||
//
|
||||
if ( m0 > 0 )
|
||||
{
|
||||
bli_dgemmsup_rv_armv8a_int_3x8mn
|
||||
ker_fp1
|
||||
(
|
||||
conja, conjb, mr1, n0, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
a += mr1 * rs_a0;
|
||||
c += mr1 * rs_c0;
|
||||
if ( ker_fp2 )
|
||||
ker_fp2
|
||||
(
|
||||
conja, conjb, m0, n0, k0,
|
||||
conja, conjb, mr2, n0, k0,
|
||||
alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
|
||||
beta, c, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -36,7 +36,6 @@
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
|
||||
|
||||
// Label locality & misc.
|
||||
#include "../armv8a_asm_utils.h"
|
||||
|
||||
@@ -39,6 +39,8 @@ PACKM_KER_PROT( double, d, packm_armv8a_int_8xk )
|
||||
|
||||
GEMM_UKR_PROT( float, s, gemm_armv8a_asm_8x12 )
|
||||
GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8 )
|
||||
GEMM_UKR_PROT( float, s, gemm_armv8a_asm_12x8r )
|
||||
GEMM_UKR_PROT( double, d, gemm_armv8a_asm_8x6r )
|
||||
// GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8r )
|
||||
// GEMM_UKR_PROT( double, d, gemm_armv8a_asm_8x4 )
|
||||
// GEMM_UKR_PROT( double, d, gemm_armv8a_asm_4x4 )
|
||||
@@ -47,6 +49,10 @@ GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8n )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8m )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8n )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8m )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x7m )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x6m )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x5m )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_5x8n )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8n )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8m )
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_8x4m )
|
||||
|
||||
Reference in New Issue
Block a user