From dfa54139664a42d29774e140ec9e5597af869a76 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Tue, 30 Aug 2022 08:07:50 +0800 Subject: [PATCH] Arm64 dgemmsup with extended MR&NR (#655) Details: - Since the number of registers in NEON is large but their lengths are short, I'm here extending both MR and NR. - The approach is to represent the C microtile in registers optionally in columns, so for sizes like 6x7m, the 'crr' kernel is the default with 'rrr' supported through an in-register transpose. - A few asm kernels are crafted for 'rv' to complete this extended size support. - For 'rd' I'm still relying heavily on C99 intrinsic kernels with branching so the performance might not be optimal. (Sorry for that.) - So far, these changes only affect the 'firestorm' subconfig. - This commit also contains row-preferential s12x8 and d8x6 gemm ukernels. These microkernels are templatized versions of the existing s8x12 and d6x8 ukernels defined in bli_gemm_armv8a_asm_d6x8.c. --- config/firestorm/bli_cntx_init_firestorm.c | 32 +- kernels/armv8a/3/armv8a_asm_utils.h | 40 ++ kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c | 605 ++++++++++++++++++ .../sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c | 0 kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c | 450 ------------- .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c | 190 ++++-- .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c | 266 ++++---- .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c | 3 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c | 482 ++++++++++++++ .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c | 475 ++++++++++++++ .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c | 477 ++++++++++++++ .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c | 513 +++++++++++++++ .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c | 128 ++-- .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c | 64 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c | 1 - kernels/armv8a/bli_kernels_armv8a.h | 6 + 16 files changed, 3020 insertions(+), 712 deletions(-) create mode 100644 
kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c rename kernels/armv8a/3/{ => old}/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c (100%) delete mode 100644 kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c index 8e4d0088d..bfc7f24b9 100644 --- a/config/firestorm/bli_cntx_init_firestorm.c +++ b/config/firestorm/bli_cntx_init_firestorm.c @@ -49,14 +49,14 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) cntx, // level-3 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_12x8r, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_8x6r, // packm - BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, - BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, - BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, - BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, // gemmsup BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, @@ -77,8 +77,8 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) cntx, // level-3 - BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, - BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, // gemmsup BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, @@ -95,11 +95,11 @@ void bli_cntx_init_firestorm( cntx_t* cntx 
) // Initialize level-3 blocksize objects with architecture-specific values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 120, 252, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 3072, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 8192, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 12, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 8, 6, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 4096, 3072, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 9600, 8184, -1, -1 ); // Initialize sup thresholds with architecture-appropriate values. // s d c z @@ -110,8 +110,10 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) // Initialize level-3 sup blocksize objects with architecture-specific // values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1 ); + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1, + -1, 9, -1, -1 ); + bli_blksz_init ( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1, + -1, 13, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], -1, 240, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], -1, 1024, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], -1, 3072, -1, -1 ); diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h index 0c405dfd2..061cea66d 100644 --- a/kernels/armv8a/3/armv8a_asm_utils.h +++ b/kernels/armv8a/3/armv8a_asm_utils.h @@ -61,6 +61,18 @@ CLEAR4V(V4,V5,V6,V7) // Scale vectors. 
+#define SSCALE1V(V,A,IDX) \ +" fmul v"#V".4s, v"#V".4s, v"#A".s["#IDX"] \n\t" +#define SSCALE2V(V0,V1,A,IDX) \ + SSCALE1V(V0,A,IDX) \ + SSCALE1V(V1,A,IDX) +#define SSCALE4V(V0,V1,V2,V3,A,IDX) \ + SSCALE2V(V0,V1,A,IDX) \ + SSCALE2V(V2,V3,A,IDX) +#define SSCALE8V(V0,V1,V2,V3,V4,V5,V6,V7,A,IDX) \ + SSCALE4V(V0,V1,V2,V3,A,IDX) \ + SSCALE4V(V4,V5,V6,V7,A,IDX) + #define DSCALE1V(V,A,IDX) \ " fmul v"#V".2d, v"#V".2d, v"#A".d["#IDX"] \n\t" #define DSCALE2V(V0,V1,A,IDX) \ @@ -74,6 +86,18 @@ DSCALE4V(V4,V5,V6,V7,A,IDX) // Scale-accumulate. +#define SSCALEA1V(D,S,A,IDX) \ +" fmla v"#D".4s, v"#S".4s, v"#A".s["#IDX"] \n\t" +#define SSCALEA2V(D0,D1,S0,S1,A,IDX) \ + SSCALEA1V(D0,S0,A,IDX) \ + SSCALEA1V(D1,S1,A,IDX) +#define SSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ + SSCALEA2V(D0,D1,S0,S1,A,IDX) \ + SSCALEA2V(D2,D3,S2,S3,A,IDX) +#define SSCALEA8V(D0,D1,D2,D3,D4,D5,D6,D7,S0,S1,S2,S3,S4,S5,S6,S7,A,IDX) \ + SSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ + SSCALEA4V(D4,D5,D6,D7,S4,S5,S6,S7,A,IDX) + #define DSCALEA1V(D,S,A,IDX) \ " fmla v"#D".2d, v"#S".2d, v"#A".d["#IDX"] \n\t" #define DSCALEA2V(D0,D1,S0,S1,A,IDX) \ @@ -95,8 +119,16 @@ #define DLOAD4V(V0,V1,V2,V3,ADDR,SHIFT) \ DLOAD2V(V0,V1,ADDR,SHIFT) \ DLOAD2V(V2,V3,ADDR,SHIFT+32) +#define SLOAD1V DLOAD1V +#define SLOAD2V DLOAD2V +#define SLOAD4V DLOAD4V // Generic: load one line. +#define SLOAD1V_GATHER_ELMFWD(V,ADDR,INC) \ +" ld1 {v"#V".s}[0], ["#ADDR"], "#INC" \n\t" \ +" ld1 {v"#V".s}[1], ["#ADDR"], "#INC" \n\t" \ +" ld1 {v"#V".s}[2], ["#ADDR"], "#INC" \n\t" \ +" ld1 {v"#V".s}[3], ["#ADDR"], "#INC" \n\t" #define DLOAD1V_GATHER_ELMFWD(V,ADDR,INC) \ " ld1 {v"#V".d}[0], ["#ADDR"], "#INC" \n\t" \ " ld1 {v"#V".d}[1], ["#ADDR"], "#INC" \n\t" @@ -110,8 +142,16 @@ #define DSTORE4V(V0,V1,V2,V3,ADDR,SHIFT) \ DSTORE2V(V0,V1,ADDR,SHIFT) \ DSTORE2V(V2,V3,ADDR,SHIFT+32) +#define SSTORE1V DSTORE1V +#define SSTORE2V DSTORE2V +#define SSTORE4V DSTORE4V // Generic: store one line. 
+#define SSTORE1V_SCATTER_ELMFWD(V,ADDR,INC) \ +" st1 {v"#V".s}[0], ["#ADDR"], "#INC" \n\t" \ +" st1 {v"#V".s}[1], ["#ADDR"], "#INC" \n\t" \ +" st1 {v"#V".s}[2], ["#ADDR"], "#INC" \n\t" \ +" st1 {v"#V".s}[3], ["#ADDR"], "#INC" \n\t" #define DSTORE1V_SCATTER_ELMFWD(V,ADDR,INC) \ " st1 {v"#V".d}[0], ["#ADDR"], "#INC" \n\t" \ " st1 {v"#V".d}[1], ["#ADDR"], "#INC" \n\t" diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c new file mode 100644 index 000000000..b0df23fb0 --- /dev/null +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c @@ -0,0 +1,605 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" + +// Label locality & misc. +#include "armv8a_asm_utils.h" + +// Nanokernel operations. +#include "armv8a_asm_d2x2.h" + +/* Order of row-major SGEMM_12x8's execution in 4x4 blocks: + * + * +---+ +---+ + * | 0 | | 1 | + * +---+ +---+ + * +---+ +---+ + * | 2 | | 3 | + * +---+ +---+ + * +---+ +---+ + * | 4 | | 5 | + * +---+ +---+ + */ +#define SGEMM_12X8_MKER_LOOP_PLAIN(C00,C01,C10,C11,C20,C21,C30,C31,C40,C41,C50,C51,C60,C61,C70,C71,C80,C81,C90,C91,CA0,CA1,CB0,CB1,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ + SGEMM_4X4_NANOKERNEL(C00,C10,C20,C30,B0,A0) \ + SGEMM_4X4_NANOKERNEL(C01,C11,C21,C31,B1,A0) \ + DGEMM_LOAD1V_ ##LOADNEXT (A0,AADDR,ASHIFT) /* Contiguous load is the same across S/D. */ \ + SGEMM_4X4_NANOKERNEL(C40,C50,C60,C70,B0,A1) \ + SGEMM_4X4_NANOKERNEL(C41,C51,C61,C71,B1,A1) \ + DGEMM_LOAD1V_ ##LOADNEXT (A1,AADDR,ASHIFT+16) \ + SGEMM_4X4_NANOKERNEL(C80,C90,CA0,CB0,B0,A2) \ + DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ + SGEMM_4X4_NANOKERNEL(C81,C91,CA1,CB1,B1,A2) + +// For contiguous storage of C, SLOAD is the same as DLOAD. 
+#define SLOADC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" +#define SSTOREC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +/* Order of row-major DGEMM_8x6's execution in 2x2 blocks: + * + * +---+ +---+ +---+ + * | 0 | | 2 | | 4 | + * +---+ +---+ +---+ + * +---+ +---+ +---+ + * | 1 | | 3 | | 5 | + * +---+ +---+ +---+ + * +---+ +---+ +---+ + * | 6 | | 8 | | 10| + * +---+ +---+ +---+ + * +---+ +---+ +---+ + * | 7 | | 9 | | 11| + * +---+ +---+ +---+ + * + */ +#define DGEMM_8X6_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,C60,C61,C62,C70,C71,C72,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \ + DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \ + DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \ + DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \ + DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \ + DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \ + DGEMM_LOAD2V_ ##LOADNEXT (A0,A1,AADDR,ASHIFT) \ + DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \ + DGEMM_2X2_NANOKERNEL(C60,C70,B0,A3) \ + DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ + DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \ + DGEMM_2X2_NANOKERNEL(C61,C71,B1,A3) \ + DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) \ + DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2) \ + DGEMM_2X2_NANOKERNEL(C62,C72,B2,A3) + +// Interleaving load or not. +#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) +#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ + DLOAD1V(V1,ADDR,IMM) + +#define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM) +#define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \ + DGEMM_LOAD1V_load(V1,ADDR,IMM) \ + DGEMM_LOAD1V_load(V2,ADDR,IMM+16) + +// For contiguous storage of C. 
+#define DLOADC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ + DLOAD1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" +#define DSTOREC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ + DSTORE1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +// Prefetch C. +#define PRFMC_FWD(CADDR,RSC,LASTB) \ +" prfm PLDL1KEEP, ["#CADDR"] \n\t" \ +" prfm PLDL1KEEP, ["#CADDR", "#LASTB"] \n\t" \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +void bli_sgemm_armv8a_asm_12x8r + ( + dim_t m, + dim_t n, + dim_t k, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + const void* a_next = bli_auxinfo_next_a( data ); + const void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k / 4; + uint64_t k_left = k % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + GEMM_UKR_SETUP_CT( s, 12, 8, true ); + + __asm__ volatile + ( +" ldr x0, %[a] \n\t" +" ldr x1, %[b] \n\t" +" mov x2, #12 \n\t" // Column-skip of A. +" mov x3, #8 \n\t" // Row-skip of B. +" \n\t" +" ldr x5, %[c] \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. (column-skip == 1) +" \n\t" +" \n\t" // Multiply some address skips by sizeof(float). +" lsl x2, x2, #2 \n\t" // cs_a +" lsl x3, x3, #2 \n\t" // rs_b +" lsl x6, x6, #2 \n\t" // rs_c +" \n\t" +" cmp %w[ct], wzr \n\t" +" mov x9, x5 \n\t" +BNE(SEND_PRFMC_FH) +PRFMC_FWD(x9,x6,32) // Prefetch C 01/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 02/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 03/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 04/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 05/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 06/12. +LABEL(SEND_PRFMC_FH) +" \n\t" +" ldr x4, %[k_mker] \n\t" // Number of loops. 
+" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:23] <- C +// V[24:27] <- A +// V[28:31] <- B +// Under this scheme, the following is defined: +#define SGEMM_12X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ + SGEMM_12X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) +// Load from memory. +LABEL(SLOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(SCLEAR_CCOLS) +" \n\t" +" ldr q24, [x0, #16*0] \n\t" // Load A. +" ldr q25, [x0, #16*1] \n\t" +" ldr q26, [x0, #16*2] \n\t" +" add x0, x0, x2 \n\t" +" ldr q27, [x0, #16*0] \n\t" +" \n\t" +" cmp %w[ct], wzr \n\t" +BNE(SEND_PRFMC_LH) +PRFMC_FWD(x9,x6,32) // Prefetch C 07/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 08/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 09/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 10/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 11/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 12/12. +LABEL(SEND_PRFMC_LH) +" cmp x4, #0 \n\t" // Reset branching flag. +" \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B. +" ldr q29, [x1, #16*1] \n\t" +" add x1, x1, x3 \n\t" +" ldr q30, [x1, #16*0] \n\t" +" ldr q31, [x1, #16*1] \n\t" +" add x1, x1, x3 \n\t" +LABEL(SCLEAR_CCOLS) +CLEAR8V(0,1,2,3,4,5,6,7) +CLEAR8V(8,9,10,11,12,13,14,15) +CLEAR8V(16,17,18,19,20,21,22,23) +// No-microkernel early return, once again. +BEQ(SK_LEFT_LOOP) +// +// Microkernel is defined here as: +#define SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1) \ + SGEMM_12X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,x0,16,x1,0,load) \ + "add x0, x0, x2 \n\t" \ + "ldr q"#A2", [x0, #16*0] \n\t" \ + "ldr q"#B1", [x1, #16*1] \n\t" \ + "add x1, x1, x3 \n\t" +// Start microkernel loop. +LABEL(SK_MKER_LOOP) +SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29) +SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(27,24,25,30,31) +" \n\t" // Decrease counter before final replica. 
+" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. +BEQ(SFIN_MKER_LOOP) +SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(26,27,24,28,29) +SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,30,31) +BRANCH(SK_MKER_LOOP) +// +// Final microkernel loop. +LABEL(SFIN_MKER_LOOP) +SGEMM_12X8_MKER_LOOP_PLAIN_LOC(26,27,24,28,29,xzr,-1,xzr,-1,noload) +" ldr q26, [x0, #16*1] \n\t" +" ldr q27, [x0, #16*2] \n\t" +" add x0, x0, x2 \n\t" +SGEMM_12X8_MKER_LOOP_PLAIN_LOC(25,26,27,30,31,xzr,-1,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(SK_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(SWRITE_MEM_PREP) +" ldr q24, [x0, #16*0] \n\t" // Load A col. +" ldr q25, [x0, #16*1] \n\t" +" ldr q26, [x0, #16*2] \n\t" +" add x0, x0, x2 \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B row. +" ldr q29, [x1, #16*1] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +SGEMM_12X8_MKER_LOOP_PLAIN_LOC(24,25,26,28,29,xzr,-1,xzr,-1,noload) +BRANCH(SK_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(SWRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v24.4s}, [x4] \n\t" // Load alpha & beta. +" ld1r {v25.4s}, [x8] \n\t" +" \n\t" +LABEL(SPREFETCH_ABNEXT) +" ldr x0, %[a_next] \n\t" +" ldr x1, %[b_next] \n\t" +" prfm PLDL1STRM, [x0, 64*0] \n\t" // Do not know cache line size, +" prfm PLDL1STRM, [x0, 64*1] \n\t" // issue some number of prfm instructions +" prfm PLDL1STRM, [x0, 64*2] \n\t" // to try to activate hardware prefetcher. +" prfm PLDL1STRM, [x1, 64*0] \n\t" +" prfm PLDL1STRM, [x1, 64*1] \n\t" +" prfm PLDL1STRM, [x1, 64*3] \n\t" +" \n\t" +" fmov d26, #1.0 \n\t" +" fcvt s26, d26 \n\t" +" fcmp s24, s26 \n\t" +BEQ(SUNIT_ALPHA) +SSCALE8V(0,1,2,3,4,5,6,7,24,0) +SSCALE8V(8,9,10,11,12,13,14,15,24,0) +SSCALE8V(16,17,18,19,20,21,22,23,24,0) +LABEL(SUNIT_ALPHA) +" \n\t" +" mov x9, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +// +// Contiguous C-storage. 
+LABEL(SWRITE_MEM_R) +" fcmp s25, #0.0 \n\t" // Sets conditional flag whether *beta == 0. +" \n\t" // This conditional flag will be used +" \n\t" // multiple times for skipping load. +// Row 0 & 1 & 2: +BEQ(SZERO_BETA_R_0_1_2) +SLOADC_2V_R_FWD(26,27,x9,0,x6) +SLOADC_2V_R_FWD(28,29,x9,0,x6) +SLOADC_2V_R_FWD(30,31,x9,0,x6) +SSCALEA2V(0,1,26,27,25,0) +SSCALEA2V(2,3,28,29,25,0) +SSCALEA2V(4,5,30,31,25,0) +LABEL(SZERO_BETA_R_0_1_2) +SSTOREC_2V_R_FWD(0,1,x5,0,x6) +SSTOREC_2V_R_FWD(2,3,x5,0,x6) +SSTOREC_2V_R_FWD(4,5,x5,0,x6) +// Row 3 & 4 & 5 & 6 & 7 & 8: +BEQ(SZERO_BETA_R_3_4_5_6_7_8) +SLOADC_2V_R_FWD(26,27,x9,0,x6) +SLOADC_2V_R_FWD(28,29,x9,0,x6) +SLOADC_2V_R_FWD(30,31,x9,0,x6) +SLOADC_2V_R_FWD(0,1,x9,0,x6) +SLOADC_2V_R_FWD(2,3,x9,0,x6) +SLOADC_2V_R_FWD(4,5,x9,0,x6) +SSCALEA4V(6,7,8,9,26,27,28,29,25,0) +SSCALEA4V(10,11,12,13,30,31,0,1,25,0) +SSCALEA4V(14,15,16,17,2,3,4,5,25,0) +LABEL(SZERO_BETA_R_3_4_5_6_7_8) +SSTOREC_2V_R_FWD(6,7,x5,0,x6) +SSTOREC_2V_R_FWD(8,9,x5,0,x6) +SSTOREC_2V_R_FWD(10,11,x5,0,x6) +SSTOREC_2V_R_FWD(12,13,x5,0,x6) +SSTOREC_2V_R_FWD(14,15,x5,0,x6) +SSTOREC_2V_R_FWD(16,17,x5,0,x6) +// Row 9 & 10 & 11 +BEQ(SZERO_BETA_R_9_10_11) +SLOADC_2V_R_FWD(26,27,x9,0,x6) +SLOADC_2V_R_FWD(28,29,x9,0,x6) +SLOADC_2V_R_FWD(30,31,x9,0,x6) +SSCALEA2V(18,19,26,27,25,0) +SSCALEA2V(20,21,28,29,25,0) +SSCALEA2V(22,23,30,31,25,0) +LABEL(SZERO_BETA_R_9_10_11) +SSTOREC_2V_R_FWD(18,19,x5,0,x6) +SSTOREC_2V_R_FWD(20,21,x5,0,x6) +SSTOREC_2V_R_FWD(22,23,x5,0,x6) +// Done. +LABEL(SEND_WRITE_MEM) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_c] "m" (rs_c), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta), + [a_next] "m" (a_next), + [b_next] "m" (b_next), + [ct] "r" (_use_ct) // Defined by macro. 
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19", + "v20","v21","v22","v23", + "v24","v25","v26","v27", + "v28","v29","v30","v31" + ); + + GEMM_UKR_FLUSH_CT( s ); +} + +/* + * Differences from the col-major 6x8 in HW modeling: + * * Stream HW prefetcher is assumed s.t. PRFM instructions for packed A&B are omitted. + */ +void bli_dgemm_armv8a_asm_8x6r + ( + dim_t m, + dim_t n, + dim_t k, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + const void* a_next = bli_auxinfo_next_a( data ); + const void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k / 4; + uint64_t k_left = k % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + GEMM_UKR_SETUP_CT( d, 8, 6, true ); + + __asm__ volatile + ( +" ldr x0, %[a] \n\t" +" ldr x1, %[b] \n\t" +" mov x2, #8 \n\t" // Column-skip of A. +" mov x3, #6 \n\t" // Row-skip of B. +" \n\t" +" ldr x5, %[c] \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. (column-skip == 1) +" \n\t" +" \n\t" // Multiply some address skips by sizeof(double). +" lsl x2, x2, #3 \n\t" // cs_a +" lsl x3, x3, #3 \n\t" // rs_b +" lsl x6, x6, #3 \n\t" // rs_c +" \n\t" +" cmp %w[ct], wzr \n\t" +" mov x9, x5 \n\t" +BNE(DEND_PRFMC) +PRFMC_FWD(x9,x6,40) // Prefetch C 1/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 2/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 3/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 4/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 5/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 6/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 7/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 8/8. +LABEL(DEND_PRFMC) +" \n\t" +" ldr x4, %[k_mker] \n\t" // Number of loops. 
+" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:23] <- C +// V[24:27] <- A +// V[28:31] <- B +// Under this scheme, the following is defined: +#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_8X6_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) +// Load from memory. +LABEL(DLOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(DCLEAR_CCOLS) +" \n\t" +" ldr q24, [x0, #16*0] \n\t" // Load A. +" ldr q25, [x0, #16*1] \n\t" +" ldr q26, [x0, #16*2] \n\t" +" ldr q27, [x0, #16*3] \n\t" +" add x0, x0, x2 \n\t" +" \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B. +" ldr q29, [x1, #16*1] \n\t" +" ldr q30, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" ldr q31, [x1, #16*0] \n\t" +LABEL(DCLEAR_CCOLS) +CLEAR8V(0,1,2,3,4,5,6,7) +CLEAR8V(8,9,10,11,12,13,14,15) +CLEAR8V(16,17,18,19,20,21,22,23) +// No-microkernel early return, once again. +BEQ(DK_LEFT_LOOP) +// +// Microkernel is defined here as: +#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,B0,B1,B2) \ + DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,x0,0,x1,16,load) \ + "add x1, x1, x3 \n\t" \ + "ldr q"#B2", [x1, #16*0] \n\t" \ + "ldr q"#A2", [x0, #16*2] \n\t" \ + "ldr q"#A3", [x0, #16*3] \n\t" \ + "add x0, x0, x2 \n\t" +// Start microkernel loop. +LABEL(DK_MKER_LOOP) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,28,29,30) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,31,28,29) +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. +BEQ(DFIN_MKER_LOOP) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,30,31,28) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,29,30,31) +BRANCH(DK_MKER_LOOP) +// +// Final microkernel loop. 
+LABEL(DFIN_MKER_LOOP) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,30,31,28,x0,0,x1,16,load) +" add x1, x1, x3 \n\t" +" ldr q26, [x0, #16*2] \n\t" +" ldr q27, [x0, #16*3] \n\t" +" add x0, x0, x2 \n\t" +DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,29,30,31,xzr,-1,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(DK_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(DWRITE_MEM_PREP) +" ldr q24, [x0, #16*0] \n\t" // Load A col. +" ldr q25, [x0, #16*1] \n\t" +" ldr q26, [x0, #16*2] \n\t" +" ldr q27, [x0, #16*3] \n\t" +" add x0, x0, x2 \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B row. +" ldr q29, [x1, #16*1] \n\t" +" ldr q30, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload) +BRANCH(DK_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(DWRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v24.2d}, [x4] \n\t" // Load alpha & beta. +" ld1r {v25.2d}, [x8] \n\t" +" \n\t" +LABEL(DPREFETCH_ABNEXT) +" ldr x0, %[a_next] \n\t" +" ldr x1, %[b_next] \n\t" +" prfm PLDL1STRM, [x0, 64*0] \n\t" // Do not know cache line size, +" prfm PLDL1STRM, [x0, 64*1] \n\t" // issue some number of prfm instructions +" prfm PLDL1STRM, [x0, 64*2] \n\t" // to try to activate hardware prefetcher. +" prfm PLDL1STRM, [x1, 64*0] \n\t" +" prfm PLDL1STRM, [x1, 64*1] \n\t" +" prfm PLDL1STRM, [x1, 64*3] \n\t" +" \n\t" +" fmov d26, #1.0 \n\t" +" fcmp d24, d26 \n\t" +BEQ(DUNIT_ALPHA) +DSCALE8V(0,1,2,3,4,5,6,7,24,0) +DSCALE8V(8,9,10,11,12,13,14,15,24,0) +DSCALE8V(16,17,18,19,20,21,22,23,24,0) +LABEL(DUNIT_ALPHA) +" \n\t" +" mov x9, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +// +// Contiguous C-storage. +LABEL(DWRITE_MEM_R) +" fcmp d25, #0.0 \n\t" // Sets conditional flag whether *beta == 0. +" \n\t" // This conditional flag will be used +" \n\t" // multiple times for skipping load. 
+// Row 0 & 1: +BEQ(DZERO_BETA_R_0_1) +DLOADC_3V_R_FWD(26,27,28,x9,0,x6) +DLOADC_3V_R_FWD(29,30,31,x9,0,x6) +DSCALEA2V(0,1,26,27,25,0) +DSCALEA2V(2,3,28,29,25,0) +DSCALEA2V(4,5,30,31,25,0) +LABEL(DZERO_BETA_R_0_1) +DSTOREC_3V_R_FWD(0,1,2,x5,0,x6) +DSTOREC_3V_R_FWD(3,4,5,x5,0,x6) +// Row 2 & 3 & 4 & 5: +BEQ(DZERO_BETA_R_2_3_4_5) +DLOADC_3V_R_FWD(26,27,28,x9,0,x6) +DLOADC_3V_R_FWD(29,30,31,x9,0,x6) +DLOADC_3V_R_FWD(0,1,2,x9,0,x6) +DLOADC_3V_R_FWD(3,4,5,x9,0,x6) +DSCALEA4V(6,7,8,9,26,27,28,29,25,0) +DSCALEA4V(10,11,12,13,30,31,0,1,25,0) +DSCALEA4V(14,15,16,17,2,3,4,5,25,0) +LABEL(DZERO_BETA_R_2_3_4_5) +DSTOREC_3V_R_FWD(6,7,8,x5,0,x6) +DSTOREC_3V_R_FWD(9,10,11,x5,0,x6) +DSTOREC_3V_R_FWD(12,13,14,x5,0,x6) +DSTOREC_3V_R_FWD(15,16,17,x5,0,x6) +// Row 6 & 7 +BEQ(DZERO_BETA_R_6_7) +DLOADC_3V_R_FWD(26,27,28,x9,0,x6) +DLOADC_3V_R_FWD(29,30,31,x9,0,x6) +DSCALEA2V(18,19,26,27,25,0) +DSCALEA2V(20,21,28,29,25,0) +DSCALEA2V(22,23,30,31,25,0) +LABEL(DZERO_BETA_R_6_7) +DSTOREC_3V_R_FWD(18,19,20,x5,0,x6) +DSTOREC_3V_R_FWD(21,22,23,x5,0,x6) +// Done. +LABEL(DEND_WRITE_MEM) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_c] "m" (rs_c), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta), + [a_next] "m" (a_next), + [b_next] "m" (b_next), + [ct] "r" (_use_ct) // Defined by macro. 
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19", + "v20","v21","v22","v23", + "v24","v25","v26","v27", + "v28","v29","v30","v31" + ); + + GEMM_UKR_FLUSH_CT( d ); +} + diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c b/kernels/armv8a/3/old/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c similarity index 100% rename from kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c rename to kernels/armv8a/3/old/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c diff --git a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c b/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c deleted file mode 100644 index 44e0ac419..000000000 --- a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c +++ /dev/null @@ -1,450 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// Separate instantiation for Armv8-A reference kernels. -// Temporary workaround. Will be removed after upstream has switched to a better way -// of exposing gemmsup interface. - -// -// -- Row storage case --------------------------------------------------------- -// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data, \ - cntx_t* cntx \ - ) \ -{ \ - /* NOTE: This microkernel can actually handle arbitrarily large - values of m, n, and k. */ \ -\ - if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by rows. */ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. 
*/ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ - { \ - /* Traverse c by rows. */ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by rows. 
*/ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ - { \ - /* Traverse c by rows. */ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ - PASTEMAC(ch,conjs)( ab ); \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. 
*/ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemmsup_r, _armv8a, _ref2 ) - -// -// -- Column storage case ------------------------------------------------------ -// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data, \ - cntx_t* cntx \ - ) \ -{ \ - /* NOTE: This microkernel can actually handle arbitrarily large - values of m, n, and k. */ \ -\ - if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. 
*/ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. 
*/ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ - PASTEMAC(ch,conjs)( ab ); \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. 
*/ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemmsup_c, _armv8a, _ref2 ) - diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c index cade3ee05..847bfe8da 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c @@ -37,7 +37,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. #include "../armv8a_asm_utils.h" @@ -109,6 +108,83 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) " prfm PLDL1KEEP, ["#CADDR"] \n\t" \ " add "#CADDR", "#CADDR", "#DLONGC" \n\t" + +BLIS_INLINE +void bli_dgemmsup_rd_armv8a_inline_3x4m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( n0 == 4 ); + + for ( ; m0 >= 3; m0 -= 3 ) + { + bli_dgemmsup_rd_armv8a_asm_3x4 + ( + conja, conjb, 3, 4, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + a += 3 * rs_a0; + c += 3 * rs_c0; + } + + if ( m0 > 0 ) + { + bli_dgemmsup_rd_armv8a_int_3x4 + ( + conja, conjb, m0, 4, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + } +} + +BLIS_INLINE +void bli_dgemmsup_rd_armv8a_inline_3xcm + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* 
restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + for ( ; m0 > 0; m0 -= 3 ) + { + dim_t m_loc = ( m0 < 3 ) ? m0 : 3; + + bli_dgemmsup_rd_armv8a_int_3x4 + ( + conja, conjb, m_loc, n0, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + + a += 3 * rs_a0; + c += 3 * rs_c0; + } +} + + void bli_dgemmsup_rd_armv8a_asm_6x8m ( conj_t conja, @@ -127,58 +203,74 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m { if ( n0 != 8 ) { - if ( n0 < 8 ) + assert( n0 <= 13 ); + + // Manual separation. + dgemmsup_ker_ft ker_fp1 = NULL; + dgemmsup_ker_ft ker_fp2 = NULL; + dgemmsup_ker_ft ker_fp3 = NULL; + dim_t nr1, nr2, nr3; + + switch ( n0 ) { - for ( ; n0 >= 4; n0 -= 4 ) - { - dim_t m = m0; - double *a_loc = a; - double *c_loc = c; - - for ( ; m >= 3; m -= 3 ) - { - bli_dgemmsup_rd_armv8a_asm_3x4 - ( - conja, conjb, 3, 4, k0, - alpha, a_loc, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c_loc, rs_c0, cs_c0, data, cntx - ); - a_loc += 3 * rs_a0; - c_loc += 3 * rs_c0; - } - - if ( m > 0 ) - { - bli_dgemmsup_rd_armv8a_int_3x4 - ( - conja, conjb, m, 4, k0, - alpha, a_loc, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c_loc, rs_c0, cs_c0, data, cntx - ); - } - b += 4 * cs_b0; - c += 4 * cs_c0; - } - - for ( ; m0 > 0; m0 -= 3 ) - { - dim_t m_loc = ( m0 < 3 ) ? m0 : 3; - - bli_dgemmsup_rd_armv8a_int_3x4 - ( - conja, conjb, m_loc, n0, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - - a += 3 * rs_a0; - c += 3 * rs_c0; - } + case 13: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function. + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3; + ker_fp3 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr3 = 2; break; + case 12: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function. + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr2 = 4; break; + case 11: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function. 
+ ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3; break; + case 10: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function. + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break; + case 9: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function. + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 1; break; + case 7: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr1 = 4; + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3; break; + case 6: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr1 = 4; + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break; + case 5: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = 3; + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break; + case 4: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = 4; break; + default: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = n0; break; } - else + + ker_fp1 + ( + conja, conjb, m0, nr1, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + b += nr1 * cs_b0; + c += nr1 * cs_c0; + if ( ker_fp2 ) { - assert( FALSE ); + ker_fp2 + ( + conja, conjb, m0, nr2, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + b += nr2 * cs_b0; + c += nr2 * cs_c0; } + if ( ker_fp3 ) + ker_fp3 + ( + conja, conjb, m0, nr3, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + return; } diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c index 06c9ac32c..c4fb7cac6 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c @@ -37,7 +37,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. 
#include "../armv8a_asm_utils.h" @@ -102,6 +101,122 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) " prfm PLDL1KEEP, ["#CADDR"] \n\t" \ " add "#CADDR", "#CADDR", "#DLONGC" \n\t" + +BLIS_INLINE +void bli_dgemmsup_rd_armv8a_inline_4x8n + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( m0 == 4 ); + + for ( ; n0 > 0; n0 -= 8 ) + { + // Call twice the 2xc kernel in column order. + dim_t n_loc = ( n0 < 8 ) ? n0 : 8; + bli_dgemmsup_rd_armv8a_int_2x8 + ( + conja, conjb, 2, n_loc, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + bli_dgemmsup_rd_armv8a_int_2x8 + ( + conja, conjb, 2, n_loc, k0, + alpha, a + 2 * rs_a0, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c + 2 * rs_c0, rs_c0, cs_c0, data, cntx + ); + b += 8 * cs_b0; + c += 8 * cs_c0; + } +} + +BLIS_INLINE +void bli_dgemmsup_rd_armv8a_inline_3x8n + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( m0 == 3 ); + + for ( ; n0 >= 4; n0 -= 4 ) + { + bli_dgemmsup_rd_armv8a_asm_3x4 + ( + conja, conjb, 3, 4, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + b += 4 * cs_b0; + c += 4 * cs_c0; + } + if ( n0 > 0 ) + { + bli_dgemmsup_rd_armv8a_int_3x4 + ( + conja, conjb, 3, n0, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + } +} + +BLIS_INLINE +void bli_dgemmsup_rd_armv8a_inline_rx8n + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + 
double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( m0 <= 2 ); + + for ( ; n0 > 0; n0 -= 8 ) + { + dim_t n_loc = ( n0 < 8 ) ? n0 : 8; + bli_dgemmsup_rd_armv8a_int_2x8 + ( + conja, conjb, m0, n_loc, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + b += 8 * cs_b0; + c += 8 * cs_c0; + } +} + + void bli_dgemmsup_rd_armv8a_asm_6x8n ( conj_t conja, @@ -120,116 +235,51 @@ void bli_dgemmsup_rd_armv8a_asm_6x8n { if ( m0 != 6 ) { - if ( m0 < 6 ) - { - if ( m0 == 5 ) - { - // 3xk calls. - dim_t n = n0; - double *b_loc = b; - double *c_loc = c; - for ( ; n >= 4; n -= 4 ) - { - bli_dgemmsup_rd_armv8a_asm_3x4 - ( - conja, conjb, 3, 4, k0, - alpha, a, rs_a0, cs_a0, b_loc, rs_b0, cs_b0, - beta, c_loc, rs_c0, cs_c0, data, cntx - ); - b_loc += 4 * cs_b0; - c_loc += 4 * cs_c0; - } - if ( n > 0 ) - { - bli_dgemmsup_rd_armv8a_int_3x4 - ( - conja, conjb, 3, n, k0, - alpha, a, rs_a0, cs_a0, b_loc, rs_b0, cs_b0, - beta, c_loc, rs_c0, cs_c0, data, cntx - ); - } - a += 3 * rs_a0; - c += 3 * rs_c0; + assert( m0 <= 9 ); - // 2xk calls. - for ( ; n0 > 0; n0 -= 8 ) - { - dim_t n_loc = ( n0 < 8 ) ? n0 : 8; - bli_dgemmsup_rd_armv8a_int_2x8 - ( - conja, conjb, 2, n_loc, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - b += 8 * cs_b0; - c += 8 * cs_c0; - } - return; - } - else if ( m0 == 4 ) - { - for ( ; n0 > 0; n0 -= 8 ) - { - dim_t n_loc = ( n0 < 8 ) ? 
n0 : 8; - bli_dgemmsup_rd_armv8a_int_2x8 - ( - conja, conjb, 2, n_loc, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - bli_dgemmsup_rd_armv8a_int_2x8 - ( - conja, conjb, 2, n_loc, k0, - alpha, a + 2 * rs_a0, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c + 2 * rs_c0, rs_c0, cs_c0, data, cntx - ); - b += 8 * cs_b0; - c += 8 * cs_c0; - } - } - else if ( m0 == 3 ) - { - for ( ; n0 >= 4; n0 -= 4 ) - { - bli_dgemmsup_rd_armv8a_asm_3x4 - ( - conja, conjb, 3, 4, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - b += 4 * cs_b0; - c += 4 * cs_c0; - } - if ( n0 > 0 ) - { - bli_dgemmsup_rd_armv8a_int_3x4 - ( - conja, conjb, 3, n0, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - } - } - else // m0 == 2 or 1. - { - for ( ; n0 > 0; n0 -= 8 ) - { - dim_t n_loc = ( n0 < 8 ) ? n0 : 8; - bli_dgemmsup_rd_armv8a_int_2x8 - ( - conja, conjb, m0, n_loc, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - b += 8 * cs_b0; - c += 8 * cs_c0; - } - } - } - else + // Manual separation. + dgemmsup_ker_ft ker_fp1 = NULL; + dgemmsup_ker_ft ker_fp2 = NULL; + dim_t mr1, mr2; + + switch ( m0 ) { - assert( FALSE ); + case 9: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8n; mr1 = 6; // This function. + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr2 = 3; break; + case 8: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8n; mr1 = 6; // This function. 
+ ker_fp2 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr2 = 2; break; + case 7: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3; + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_4x8n; mr2 = 4; break; + case 5: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3; + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr2 = 2; break; + case 4: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_4x8n; mr1 = 4; break; + case 3: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3; break; + default: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr1 = m0; break; } + + ker_fp1 + ( + conja, conjb, mr1, n0, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + a += mr1 * rs_a0; + c += mr1 * rs_c0; + if ( ker_fp2 ) + ker_fp2 + ( + conja, conjb, mr2, n0, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + return; } diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c index bc7402a5f..b7d1a7d0f 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c @@ -36,7 +36,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. #include "../armv8a_asm_utils.h" @@ -76,6 +75,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) " prfm PLDL1KEEP, ["#CADDR"] \n\t" \ " add "#CADDR", "#CADDR", "#DLONGC" \n\t" +// For row-storage of C. #define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" @@ -83,6 +83,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" +// For column-storage of C. 
#define DLOADC_4V_C_FWD(C00,C10,C01,C11,CADDR,CSHIFT,CSC) \ DLOAD2V(C00,C10,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" \ diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c new file mode 100644 index 000000000..eaddfd076 --- /dev/null +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c @@ -0,0 +1,482 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" +#include "assert.h" + + +// Label locality & misc. +#include "../armv8a_asm_utils.h" + +// Nanokernel operations. +#include "../armv8a_asm_d2x2.h" + +/* Order of row-major DGEMM_5x8's execution in 2x2 blocks: + * + * +---+ +---+ +---+ +---+ + * | 0 | | 1 | | 6 | | 7 | + * +---+ +---+ +---+ +---+ + * +---+ +---+ +---+ +---+ + * | 2 | | 3 | | 8 | | 9 | + * +---+ +---+ +---+ +---+ + * ----- ----- ----- ----- + * 4 5 10 11 + */ +#define DGEMM_5X8_MKER_LOOP_PLAIN(C00,C01,C02,C03,C10,C11,C12,C13,C20,C21,C22,C23,C30,C31,C32,C33,C40,C41,C42,C43,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \ + DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \ + DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \ + DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \ +" fmla v"#C40".2d, v"#B0".2d, v"#A2".d[0] \n\t" \ +" fmla v"#C41".2d, v"#B1".2d, v"#A2".d[0] \n\t" \ + DGEMM_LOAD2V_ ##LOADNEXT (B0,B1,BADDR,BSHIFT) \ + DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \ + DGEMM_2X2_NANOKERNEL(C03,C13,B3,A0) \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \ + DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \ + DGEMM_2X2_NANOKERNEL(C23,C33,B3,A1) \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \ +" fmla v"#C42".2d, v"#B2".2d, v"#A2".d[0] \n\t" \ +" fmla v"#C43".2d, v"#B3".2d, v"#A2".d[0] \n\t" + +// Interleaving load or not.
+#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) +#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ +" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t" + +#define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM) +#define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \ + DGEMM_LOAD1V_load(V1,ADDR,IMM) \ + DGEMM_LOAD1V_load(V2,ADDR,IMM+16) + +#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST) +#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \ +" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \ +" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t" + +// Prefetch C in the long direction. +#define DPRFMC_FWD(CADDR,DLONGC) \ +" prfm PLDL1KEEP, ["#CADDR"] \n\t" \ +" add "#CADDR", "#CADDR", "#DLONGC" \n\t" + +// For row-storage of C. +#define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ + DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" +#define DSTOREC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ + DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +// For column-storage of C: Store 2+1/2 vectors. +#define DLOADC_2PHV_C_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,CSC,CTMP) \ +" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ +" ld1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" +#define DSTOREC_2PHV_C_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,CSC,CTMP) \ +" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ +" st1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" + +#define DSCALE5V(V0,V1,V2,V3,V4,A,IDX) \ + DSCALE4V(V0,V1,V2,V3,A,IDX) \ + DSCALE1V(V4,A,IDX) +#define DSCALEA5V(D0,D1,D2,D3,D4,S0,S1,S2,S3,S4,A,IDX) \ + DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ + DSCALEA1V(D4,S4,A,IDX) + + +void bli_dgemmsup_rv_armv8a_asm_5x8n + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + 
cntx_t* cntx + ) +{ + assert( m0 == 5 ); + + // LLVM has very bad routing ability for inline asm. + // Limit number of registers in case of Clang compilation. +#ifndef __clang__ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); +#endif + uint64_t ps_b = bli_auxinfo_ps_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 6; + uint64_t k_left = k0 % 6; + + int64_t n_iter = n0 / 8; + int64_t n_left = n0 % 8; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + // uint64_t cs_b = cs_b0; + assert( cs_b0 == 1 ); + + if ( n_iter == 0 ) goto consider_edge_cases; + + __asm__ volatile + ( +" ldr x10, %[b] \n\t" +" ldr x13, %[c] \n\t" +" ldr x12, %[n_iter] \n\t" +" ldr x11, %[ps_b] \n\t" // Panel-skip of B. +" ldr x3, %[rs_b] \n\t" // Row-skip of B. +" ldr x9, %[rs_a] \n\t" // Row-skip of A. +" ldr x2, %[cs_a] \n\t" // Column-skip of A. +" \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. +" ldr x7, %[cs_c] \n\t" // Column-skip of C. +" \n\t" +" \n\t" // Multiply some address skips by sizeof(double). +" lsl x11, x11, #3 \n\t" // ps_b +" lsl x9, x9, #3 \n\t" // rs_a +" lsl x2, x2, #3 \n\t" // cs_a +" lsl x3, x3, #3 \n\t" // rs_b +" lsl x6, x6, #3 \n\t" // rs_c +" lsl x7, x7, #3 \n\t" // cs_c +" \n\t" +" mov x1, x5 \n\t" +" cmp x7, #8 \n\t" // Prefetch column-strided C. +BEQ(C_PREFETCH_COLS) +DPRFMC_FWD(x1,x6) +DPRFMC_FWD(x1,x6) +DPRFMC_FWD(x1,x6) +DPRFMC_FWD(x1,x6) +DPRFMC_FWD(x1,x6) +BRANCH(C_PREFETCH_END) +LABEL(C_PREFETCH_COLS) +// This prefetch will not cover further mker parts. Skip. +// +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +LABEL(C_PREFETCH_END) +// +// Millikernel.
+LABEL(MILLIKER_MLOOP) +" \n\t" +" mov x1, x10 \n\t" // Parameters to be reloaded +" mov x5, x13 \n\t" // within each millikernel loop. +" ldr x0, %[a] \n\t" +" ldr x4, %[k_mker] \n\t" +" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:19] <- C +// V[20:25] <- A +// V[26:31] <- B +// Under this scheme, the following is defined: +#define DGEMM_5X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_5X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) +// Load from memory. +LABEL(LOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(CLEAR_CCOLS) +" \n\t" +" ldr q26, [x1, #16*0] \n\t" // Load B first. +" ldr q27, [x1, #16*1] \n\t" +" ldr q28, [x1, #16*2] \n\t" +" ldr q29, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +" ldr q30, [x1, #16*0] \n\t" +" ldr q31, [x1, #16*1] \n\t" +" \n\t" +" mov x14, x0 \n\t" // Load A. +" ld1 {v20.d}[0], [x14], x9 \n\t" // We want A to be kept in L1. +" ld1 {v20.d}[1], [x14], x9 \n\t" +" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" ld1 {v24.d}[0], [x14], x9 \n\t" +" ld1 {v24.d}[1], [x14], x9 \n\t" +" ld1 {v25.d}[0], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +LABEL(CLEAR_CCOLS) +CLEAR8V(0,1,2,3,4,5,6,7) +CLEAR8V(8,9,10,11,12,13,14,15) +CLEAR4V(16,17,18,19) +// No-microkernel early return, once again. +BEQ(K_LEFT_LOOP) +// +// Microkernel is defined here as: +#define DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3) \ + DGEMM_5X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,x14,x9,x1,16*2,load) \ + "add x1, x1, x3 \n\t" \ + "ldr q"#B2", [x1, #16*0] \n\t" /* Next B line. 
*/ \ + "ldr q"#B3", [x1, #16*1] \n\t" \ + "ld1 {v"#A2".d}[0], [x14], x9 \n\t" /* Finish A line. */ \ + "add x0, x0, x2 \n\t" \ + "mov x14, x0 \n\t" +// Start microkernel loop. +LABEL(K_MKER_LOOP) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,26,27,28,29) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,30,31,26,27) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,28,29,30,31) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,26,27,28,29) +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,30,31,26,27) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,28,29,30,31) +BRANCH(K_MKER_LOOP) +// +// Final microkernel loop. +LABEL(FIN_MKER_LOOP) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC(20,21,22,30,31,26,27,xzr,-1,xzr,-1,noload) +" ldr q30, [x1, #16*2] \n\t" +" ldr q31, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +DGEMM_5X8_MKER_LOOP_PLAIN_LOC(23,24,25,28,29,30,31,xzr,-1,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(K_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(WRITE_MEM_PREP) +" ldr q26, [x1, #16*0] \n\t" // Load B row. +" ldr q27, [x1, #16*1] \n\t" +" ldr q28, [x1, #16*2] \n\t" +" ldr q29, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +" mov x14, x0 \n\t" +" ld1 {v20.d}[0], [x14], x9 \n\t" // Load A col. +" ld1 {v20.d}[1], [x14], x9 \n\t" +" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" sub x8, x8, #1 \n\t" +DGEMM_5X8_MKER_LOOP_PLAIN_LOC(20,21,22,26,27,28,29,xzr,-1,xzr,-1,noload) +BRANCH(K_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(WRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta. 
+" ld1r {v31.2d}, [x8] \n\t" +" fmov d20, #1.0 \n\t" +" fcmp d30, d20 \n\t" +BEQ(UNIT_ALPHA_R) +DSCALE8V(0,1,2,3,4,5,6,7,30,0) +DSCALE8V(8,9,10,11,12,13,14,15,30,0) +DSCALE4V(16,17,18,19,30,0) +LABEL(UNIT_ALPHA_R) +" \n\t" +" mov x1, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +" cmp x7, #8 \n\t" // Check for column-storage. +BNE(WRITE_MEM_C) +// +// C storage in rows. +LABEL(WRITE_MEM_R) +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_R_1_2) +DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6) +DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6) +DSCALEA4V(0,1,2,3,20,21,22,23,31,0) +DSCALEA4V(4,5,6,7,24,25,26,27,31,0) +LABEL(ZERO_BETA_R_1_2) +DSTOREC_4V_R_FWD(0,1,2,3,x5,0,x6) +DSTOREC_4V_R_FWD(4,5,6,7,x5,0,x6) +BEQ(ZERO_BETA_R_3_4_5) +DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6) +DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6) +DLOADC_4V_R_FWD(0,1,2,3,x1,0,x6) +DSCALEA8V(8,9,10,11,12,13,14,15,20,21,22,23,24,25,26,27,31,0) +DSCALEA4V(16,17,18,19,0,1,2,3,31,0) +LABEL(ZERO_BETA_R_3_4_5) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BRANCH(PRFM_END_R) +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_R) +#endif +DSTOREC_4V_R_FWD(8,9,10,11,x5,0,x6) +DSTOREC_4V_R_FWD(12,13,14,15,x5,0,x6) +DSTOREC_4V_R_FWD(16,17,18,19,x5,0,x6) +BRANCH(END_WRITE_MEM) +// +// C storage in columns. +LABEL(WRITE_MEM_C) +// In-register transpose, +// do transposition in row-order. +" trn1 v20.2d, v0.2d, v4.2d \n\t" // Row 0-1. +" trn2 v21.2d, v0.2d, v4.2d \n\t" +" trn1 v22.2d, v1.2d, v5.2d \n\t" +" trn2 v23.2d, v1.2d, v5.2d \n\t" +" trn1 v24.2d, v2.2d, v6.2d \n\t" +" trn2 v25.2d, v2.2d, v6.2d \n\t" +" trn1 v26.2d, v3.2d, v7.2d \n\t" +" trn2 v27.2d, v3.2d, v7.2d \n\t" +" \n\t" +" trn1 v0.2d, v8.2d, v12.2d \n\t" // Row 2-3. 
+" trn2 v1.2d, v8.2d, v12.2d \n\t" +" trn1 v2.2d, v9.2d, v13.2d \n\t" +" trn2 v3.2d, v9.2d, v13.2d \n\t" +" trn1 v4.2d, v10.2d, v14.2d \n\t" +" trn2 v5.2d, v10.2d, v14.2d \n\t" +" trn1 v6.2d, v11.2d, v15.2d \n\t" +" trn2 v7.2d, v11.2d, v15.2d \n\t" +" \n\t" +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_C_1_2_3_4) +DLOADC_2PHV_C_FWD(8,9,10,0,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(11,12,10,1,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(13,14,15,0,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(28,29,15,1,x1,0,x7,x8) +DSCALEA5V(20,0,21,1,16,8,9,11,12,10,31,0) +DSCALEA5V(22,2,23,3,17,13,14,28,29,15,31,0) +LABEL(ZERO_BETA_C_1_2_3_4) +DSTOREC_2PHV_C_FWD(20,0,16,0,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(21,1,16,1,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(22,2,17,0,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(23,3,17,1,x5,0,x7,x8) +BEQ(ZERO_BETA_C_5_6_7_8) +DLOADC_2PHV_C_FWD(8,9,10,0,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(11,12,10,1,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(13,14,15,0,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(28,29,15,1,x1,0,x7,x8) +DSCALEA5V(24,4,25,5,18,8,9,11,12,10,31,0) +DSCALEA5V(26,6,27,7,19,13,14,28,29,15,31,0) +LABEL(ZERO_BETA_C_5_6_7_8) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BRANCH(PRFM_END_C) +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_C) +#endif +DSTOREC_2PHV_C_FWD(24,4,18,0,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(25,5,18,1,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(26,6,19,0,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(27,7,19,1,x5,0,x7,x8) +// +// End of this microkernel. +LABEL(END_WRITE_MEM) +" \n\t" +" subs x12, x12, #1 \n\t" +BEQ(END_EXEC) +" \n\t" +" mov x8, #8 \n\t" +" madd x13, x7, x8, x13 \n\t" // Forward C's base address to the next logic panel. +" add x10, x10, x11 \n\t" // Forward B's base address to the next logic panel. +BRANCH(MILLIKER_MLOOP) +// +// End of execution. 
+LABEL(END_EXEC) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_b] "m" (ps_b), + [rs_b] "m" (rs_b), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + // In Clang, even "m"-passed parameter takes 1 register. + // Have to disable prefetching to pass compilation. +#ifndef __clang__ + [a_next] "r" (a_next), + [b_next] "r" (b_next), +#endif + [n_iter] "m" (n_iter), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta) +: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10","x11","x12","x13","x14", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19","v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31" + ); + +consider_edge_cases: + // Forward address. + b = b + n_iter * ps_b; + c = c + n_iter * 8 * cs_c; + if ( n_left ) + { + // Set panel stride to unpacked mode. + // Only 1 millikernel w.r.t. 6x8 is executed. + auxinfo_t data_d6x4mn = *data; + bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); + // + bli_dgemmsup_rv_armv8a_int_6x4mn + ( + conja, conjb, 5, n_left, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx + ); + } + +} + diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c new file mode 100644 index 000000000..91d6ca596 --- /dev/null +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c @@ -0,0 +1,475 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" +#include "assert.h" + + +// Label locality & misc. +#include "../armv8a_asm_utils.h" + +// Nanokernel operations. +#include "../armv8a_asm_d2x2.h" + +/* Odd-NR dgemmsup_rv_*m kernels are special in that + * despite the row-major name, C is laid out in COLUMNS in the register space.
+ * + * Block order: + * + * +---+ +---+ + * | 0 | | 3 | |6 + * +---+ +---+ | + * +---+ +---+ + * | 1 | | 4 | |7 + * +---+ +---+ | + * +---+ +---+ + * | 2 | | 5 | |8 + * +---+ +---+ | + * + */ +#define DGEMM_C6X5_MKER_LOOP_PLAIN(C00,C01,C02,C03,C04,C10,C11,C12,C13,C14,C20,C21,C22,C23,C24,A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) \ + DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) \ + DGEMM_2X2_NANOKERNEL(C20,C21,A2,B0) \ + DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ + DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) \ + DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1) \ + DGEMM_2X2_NANOKERNEL(C22,C23,A2,B1) \ + DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) \ +" fmla v"#C04".2d, v"#A0".2d, v"#B2".d["#BIDX"] \n\t" \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \ +" fmla v"#C14".2d, v"#A1".2d, v"#B2".d["#BIDX"] \n\t" \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \ +" fmla v"#C24".2d, v"#A2".2d, v"#B2".d["#BIDX"] \n\t" + +// Interleaving load or not. +#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) +#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ +" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t" + +#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST) +#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \ +" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \ +" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t" + +// Prefetch C in the long direction. +#define DPRFMC_FWD(CADDR,DLONGC) \ +" prfm PLDL1KEEP, ["#CADDR"] \n\t" \ +" add "#CADDR", "#CADDR", "#DLONGC" \n\t" + +// For column-storage of C. +#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ + DLOAD1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" +#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ + DSTORE1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" + +// For row-storage of C: Store 2+1/2 vectors. 
+#define DLOADC_2PHV_R_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,RSC,CTMP) \ +" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ +" ld1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" +#define DSTOREC_2PHV_R_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,RSC,CTMP) \ +" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ +" st1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +#define DSCALE5V(V0,V1,V2,V3,V4,A,IDX) \ + DSCALE4V(V0,V1,V2,V3,A,IDX) \ + DSCALE1V(V4,A,IDX) +#define DSCALEA5V(D0,D1,D2,D3,D4,S0,S1,S2,S3,S4,A,IDX) \ + DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ + DSCALEA1V(D4,S4,A,IDX) + + +void bli_dgemmsup_rv_armv8a_asm_6x5m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( n0 == 5 ); + + // LLVM has very bad routing ability for inline asm. + // Limit number of registers in case of Clang compilation. +#ifndef __clang__ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); +#endif + uint64_t ps_a = bli_auxinfo_ps_a( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 6; + uint64_t k_left = k0 % 6; + + int64_t m_iter = m0 / 6; + int64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + // uint64_t cs_b = cs_b0; + assert( cs_b0 == 1 ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + __asm__ volatile + ( +" ldr x10, %[a] \n\t" +" ldr x13, %[c] \n\t" +" ldr x12, %[m_iter] \n\t" +" ldr x11, %[ps_a] \n\t" // Panel-skip of A. +" ldr x9, %[rs_a] \n\t" // Row-skip of A. 
+" ldr x2, %[cs_a] \n\t" // Column-skip of A. +" ldr x3, %[rs_b] \n\t" // Row-skip of B. +" \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. +" ldr x7, %[cs_c] \n\t" // Column-skip of C. +" \n\t" +" \n\t" // Multiply some address skips by sizeof(double). +" lsl x11, x11, #3 \n\t" // ps_a +" lsl x9, x9, #3 \n\t" // rs_a +" lsl x2, x2, #3 \n\t" // cs_a +" lsl x3, x3, #3 \n\t" // rs_b +" lsl x6, x6, #3 \n\t" // rs_c +" lsl x7, x7, #3 \n\t" // cs_c +" \n\t" +" mov x1, x13 \n\t" // C's base is in x13; x5 is not set until MILLIKER_MLOOP. +" cmp x7, #8 \n\t" // Prefetch column-strided C. +BNE(C_PREFETCH_COLS) +// This prefetch will not cover further mker parts. Skip. +// +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +BRANCH(C_PREFETCH_END) +LABEL(C_PREFETCH_COLS) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +LABEL(C_PREFETCH_END) +// +// Millikernel. +LABEL(MILLIKER_MLOOP) +" \n\t" +" mov x0, x10 \n\t" // Parameters to be reloaded +" mov x5, x13 \n\t" // within each millikernel loop. +" ldr x1, %[b] \n\t" +" ldr x4, %[k_mker] \n\t" +" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:14] <- C +// V[15:23] <- A +// V[24:29] <- B +// Under this scheme, the following is defined: +#define DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_C6X5_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) +// Load from memory. +LABEL(LOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(CLEAR_CCOLS) +" \n\t" +" mov x14, x0 \n\t" // Load A.
+" ld1 {v15.d}[0], [x14], x9 \n\t" +" ld1 {v15.d}[1], [x14], x9 \n\t" +" ld1 {v16.d}[0], [x14], x9 \n\t" +" ld1 {v16.d}[1], [x14], x9 \n\t" +" ld1 {v17.d}[0], [x14], x9 \n\t" +" ld1 {v17.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v18.d}[0], [x14], x9 \n\t" +" ld1 {v18.d}[1], [x14], x9 \n\t" +" ld1 {v19.d}[0], [x14], x9 \n\t" +" ld1 {v19.d}[1], [x14], x9 \n\t" +" ld1 {v20.d}[0], [x14], x9 \n\t" +" ld1 {v20.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" ld1 {v22.d}[1], [x14], x9 \n\t" +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" \n\t" +" ldr q24, [x1, #16*0] \n\t" // Load B. +" ldr q25, [x1, #16*1] \n\t" +" ldr d26, [x1, #16*2] \n\t" // Scalar loads into idx 0. +" add x1, x1, x3 \n\t" +" ldr q27, [x1, #16*0] \n\t" +" ldr q28, [x1, #16*1] \n\t" +" ldr d29, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +LABEL(CLEAR_CCOLS) +CLEAR4V(0,1,2,3) +CLEAR1V(4) +CLEAR4V(5,6,7,8) +CLEAR1V(9) +CLEAR4V(10,11,12,13) +CLEAR1V(14) +// No-microkernel early return, once again. +BEQ(K_LEFT_LOOP) +// +// Microkernel is defined here as: +#define DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,BIDX) \ + DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,BIDX,x14,x9,x1,0,load) \ + "ld1 {v"#A2".d}[0], [x14], x9 \n\t" \ + "ld1 {v"#A2".d}[1], [x14], x9 \n\t" \ + "add x0, x0, x2 \n\t" \ + "mov x14, x0 \n\t" \ + /* Due to this loading, BIDX can only be 0 here. */ \ + "ldr d"#B2", [x1, #16*2] \n\t" \ + "add x1, x1, x3 \n\t" +// Start microkernel loop. +LABEL(K_MKER_LOOP) +DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(15,16,17,24,25,26,0) +DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,27,28,29,0) +DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,24,25,26,0) +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. 
+BEQ(FIN_MKER_LOOP) +DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(15,16,17,27,28,29,0) +DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,24,25,26,0) +DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,27,28,29,0) +BRANCH(K_MKER_LOOP) +// +// Final microkernel loop. +LABEL(FIN_MKER_LOOP) +DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(15,16,17,27,28,29,0,xzr,-1,xzr,-1,noload) +" ldr q27, [x1, #16*0] \n\t" +" ldr q28, [x1, #16*1] \n\t" +" ldr d29, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(18,19,20,24,25,26,0,xzr,-1,xzr,-1,noload) +DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(21,22,23,27,28,29,0,xzr,-1,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(K_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(WRITE_MEM_PREP) +" mov x14, x0 \n\t" // Load A col. +" ld1 {v15.d}[0], [x14], x9 \n\t" +" ld1 {v15.d}[1], [x14], x9 \n\t" +" ld1 {v16.d}[0], [x14], x9 \n\t" +" ld1 {v16.d}[1], [x14], x9 \n\t" +" ld1 {v17.d}[0], [x14], x9 \n\t" +" ld1 {v17.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" ldr q24, [x1, #16*0] \n\t" // Load B row. +" ldr q25, [x1, #16*1] \n\t" +" ldr d26, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(15,16,17,24,25,26,0,xzr,-1,xzr,-1,noload) +BRANCH(K_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(WRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta. +" ld1r {v31.2d}, [x8] \n\t" +" fmov d26, #1.0 \n\t" +" fcmp d30, d26 \n\t" +BEQ(UNIT_ALPHA) +DSCALE5V(0,1,2,3,4,30,0) +DSCALE5V(5,6,7,8,9,30,0) +DSCALE5V(10,11,12,13,14,30,0) +LABEL(UNIT_ALPHA) +" \n\t" +" mov x1, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +" cmp x7, #8 \n\t" // Check for column-storage. +BNE(WRITE_MEM_C) +// Unlike other RV kernels, here row-storage of C requires +// in-register transpose. 
+" trn1 v15.2d, v0.2d, v1.2d \n\t" +" trn2 v16.2d, v0.2d, v1.2d \n\t" +" trn1 v17.2d, v2.2d, v3.2d \n\t" +" trn2 v18.2d, v2.2d, v3.2d \n\t" +" \n\t" +" trn1 v19.2d, v5.2d, v6.2d \n\t" +" trn2 v20.2d, v5.2d, v6.2d \n\t" +" trn1 v21.2d, v7.2d, v8.2d \n\t" +" trn2 v22.2d, v7.2d, v8.2d \n\t" +" \n\t" +" trn1 v23.2d, v10.2d, v11.2d \n\t" +" trn2 v24.2d, v10.2d, v11.2d \n\t" +" trn1 v25.2d, v12.2d, v13.2d \n\t" +" trn2 v26.2d, v12.2d, v13.2d \n\t" +" \n\t" +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_R) +DLOADC_2PHV_R_FWD(0,1,28,0,x1,0,x6,x8) +DLOADC_2PHV_R_FWD(2,3,28,1,x1,0,x6,x8) +DLOADC_2PHV_R_FWD(5,6,29,0,x1,0,x6,x8) +DLOADC_2PHV_R_FWD(7,8,29,1,x1,0,x6,x8) +DLOADC_2PHV_R_FWD(10,11,30,0,x1,0,x6,x8) +DLOADC_2PHV_R_FWD(12,13,30,1,x1,0,x6,x8) +DSCALEA5V(15,17,16,18,4,0,1,2,3,28,31,0) +DSCALEA5V(19,21,20,22,9,5,6,7,8,29,31,0) +DSCALEA5V(23,25,24,26,14,10,11,12,13,30,31,0) +LABEL(ZERO_BETA_R) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BNE(PRFM_END_R) // Prefetch a_next/b_next only when x12 == 1 (last millikernel pass). +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_R) +#endif +DSTOREC_2PHV_R_FWD(15,17,4,0,x5,0,x6,x8) +DSTOREC_2PHV_R_FWD(16,18,4,1,x5,0,x6,x8) +DSTOREC_2PHV_R_FWD(19,21,9,0,x5,0,x6,x8) +DSTOREC_2PHV_R_FWD(20,22,9,1,x5,0,x6,x8) +DSTOREC_2PHV_R_FWD(23,25,14,0,x5,0,x6,x8) +DSTOREC_2PHV_R_FWD(24,26,14,1,x5,0,x6,x8) +BRANCH(END_WRITE_MEM) +// +// C storage in columns.
+LABEL(WRITE_MEM_C) +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_C) +DLOADC_3V_C_FWD(15,20,25,x1,0,x7) +DLOADC_3V_C_FWD(16,21,26,x1,0,x7) +DLOADC_3V_C_FWD(17,22,27,x1,0,x7) +DLOADC_3V_C_FWD(18,23,28,x1,0,x7) +DLOADC_3V_C_FWD(19,24,29,x1,0,x7) +DSCALEA5V(0,1,2,3,4,15,16,17,18,19,31,0) +DSCALEA5V(5,6,7,8,9,20,21,22,23,24,31,0) +DSCALEA5V(10,11,12,13,14,25,26,27,28,29,31,0) +LABEL(ZERO_BETA_C) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BNE(PRFM_END_C) // Prefetch a_next/b_next only when x12 == 1 (last millikernel pass). +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_C) +#endif +DSTOREC_3V_C_FWD(0,5,10,x5,0,x7) +DSTOREC_3V_C_FWD(1,6,11,x5,0,x7) +DSTOREC_3V_C_FWD(2,7,12,x5,0,x7) +DSTOREC_3V_C_FWD(3,8,13,x5,0,x7) +DSTOREC_3V_C_FWD(4,9,14,x5,0,x7) +// +// End of this microkernel. +LABEL(END_WRITE_MEM) +" \n\t" +" subs x12, x12, #1 \n\t" +BEQ(END_EXEC) +" \n\t" +" mov x8, #6 \n\t" +" madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel. +" add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel. +BRANCH(MILLIKER_MLOOP) +// +// End of execution. +LABEL(END_EXEC) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a] "m" (ps_a), + [rs_b] "m" (rs_b), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + // In Clang, even "m"-passed parameter takes 1 register. + // Have to disable prefetching to pass compilation.
+#ifndef __clang__ + [a_next] "r" (a_next), + [b_next] "r" (b_next), +#endif + [m_iter] "m" (m_iter), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta) +: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10","x11","x12","x13","x14", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19","v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31" + ); + +consider_edge_cases: + // Forward address. + a = a + m_iter * ps_a; + c = c + m_iter * 6 * rs_c; + auxinfo_t data_d6x4mn = *data; + bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); + bli_dgemmsup_rv_armv8a_int_6x4mn + ( + conja, conjb, m_left, 5, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx + ); + +} + diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c new file mode 100644 index 000000000..4273030dd --- /dev/null +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c @@ -0,0 +1,477 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" +#include "assert.h" + + +// Label locality & misc. +#include "../armv8a_asm_utils.h" + +// Nanokernel operations. 
+#include "../armv8a_asm_d2x2.h" + +/* Order of row-major DGEMM_6x6's execution in 2x2 blocks: + * + * +---+ +---+ +---+ + * | 0 | | 1 | | 2 | + * +---+ +---+ +---+ + * +---+ +---+ +---+ + * | 3 | | 4 | | 5 | + * +---+ +---+ +---+ + * +---+ +---+ +---+ + * | 6 | | 7 | | 8 | + * +---+ +---+ +---+ + * + */ +#define DGEMM_6X6_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \ + DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \ + DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \ + DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \ + DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \ + DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \ + DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \ + DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ + DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \ + DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2) + +// Interleaving load or not. +#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) +#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ +" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t" + +#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST) +#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \ +" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \ +" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t" + +// Prefetch C in the long direction. +#define DPRFMC_FWD(CADDR,DLONGC) \ +" prfm PLDL1KEEP, ["#CADDR"] \n\t" \ +" add "#CADDR", "#CADDR", "#DLONGC" \n\t" + +// For row-storage of C. +#define DLOADC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \ + DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) +#define DSTOREC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \ + DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) + +// For column-storage of C. 
+#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ + DLOAD1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" +#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ + DSTORE1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" + +#define DSCALE6V(V0,V1,V2,V3,V4,V5,A,IDX) \ + DSCALE4V(V0,V1,V2,V3,A,IDX) \ + DSCALE2V(V4,V5,A,IDX) +#define DSCALEA6V(D0,D1,D2,D3,D4,D5,S0,S1,S2,S3,S4,S5,A,IDX) \ + DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ + DSCALEA2V(D4,D5,S4,S5,A,IDX) + + +void bli_dgemmsup_rv_armv8a_asm_6x6m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( n0 == 6 ); + + // LLVM has very bad routing ability for inline asm. + // Limit number of registers in case of Clang compilation. +#ifndef __clang__ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); +#endif + uint64_t ps_a = bli_auxinfo_ps_a( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 8; + uint64_t k_left = k0 % 8; + + int64_t m_iter = m0 / 6; + int64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + // uint64_t cs_b = cs_b0; + assert( cs_b0 == 1 ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + __asm__ volatile + ( +" ldr x10, %[a] \n\t" +" ldr x13, %[c] \n\t" +" ldr x12, %[m_iter] \n\t" +" ldr x11, %[ps_a] \n\t" // Panel-skip of A. +" ldr x9, %[rs_a] \n\t" // Row-skip of A. +" ldr x2, %[cs_a] \n\t" // Column-skip of A. +" ldr x3, %[rs_b] \n\t" // Row-skip of B. 
+" \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. +" ldr x7, %[cs_c] \n\t" // Column-skip of C. +" \n\t" +" \n\t" // Multiply some address skips by sizeof(double). +" lsl x11, x11, #3 \n\t" // ps_a +" lsl x9, x9, #3 \n\t" // rs_a +" lsl x2, x2, #3 \n\t" // cs_a +" lsl x3, x3, #3 \n\t" // rs_b +" lsl x6, x6, #3 \n\t" // rs_c +" lsl x7, x7, #3 \n\t" // cs_c +" \n\t" +" mov x1, x13 \n\t" // C's base is in x13; x5 is not set until MILLIKER_MLOOP. +" cmp x7, #8 \n\t" // Prefetch column-strided C. +BNE(C_PREFETCH_COLS) // x7 != 8 (cs_c != 1) is the column-storage case, as in the write-back check. +// This prefetch will not cover further mker parts. Skip. +// +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +BRANCH(C_PREFETCH_END) +LABEL(C_PREFETCH_COLS) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +LABEL(C_PREFETCH_END) +// +// Millikernel. +LABEL(MILLIKER_MLOOP) +" \n\t" +" mov x0, x10 \n\t" // Parameters to be reloaded +" mov x5, x13 \n\t" // within each millikernel loop. +" ldr x1, %[b] \n\t" +" ldr x4, %[k_mker] \n\t" +" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:17] <- C +// V[18:23] <- A +// V[24:31] <- B +// Under this scheme, the following is defined: +#define DGEMM_6X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_6X6_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) +// Load from memory. +LABEL(LOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(CLEAR_CCOLS) +" \n\t" +" mov x14, x0 \n\t" // Load A.
+" ld1 {v18.d}[0], [x14], x9 \n\t" +" ld1 {v18.d}[1], [x14], x9 \n\t" +" ld1 {v19.d}[0], [x14], x9 \n\t" +" ld1 {v19.d}[1], [x14], x9 \n\t" +" ld1 {v20.d}[0], [x14], x9 \n\t" +" ld1 {v20.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" ld1 {v22.d}[1], [x14], x9 \n\t" +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" \n\t" +" ldr q24, [x1, #16*0] \n\t" // Load B. +" ldr q25, [x1, #16*1] \n\t" +" ldr q26, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" ldr q27, [x1, #16*0] \n\t" +" ldr q28, [x1, #16*1] \n\t" +" ldr q29, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" ldr q30, [x1, #16*0] \n\t" +" ldr q31, [x1, #16*1] \n\t" +LABEL(CLEAR_CCOLS) +CLEAR4V(0,1,2,3) +CLEAR2V(4,5) +CLEAR4V(6,7,8,9) +CLEAR2V(10,11) +CLEAR4V(12,13,14,15) +CLEAR2V(16,17) +// No-microkernel early return, once again. +BEQ(K_LEFT_LOOP) +// +// Microkernel is defined here as: +#define DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2) \ + DGEMM_6X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,x14,x9,x1,16*2,load) \ + "ld1 {v"#A2".d}[0], [x14], x9 \n\t" \ + "ld1 {v"#A2".d}[1], [x14], x9 \n\t" \ + "add x0, x0, x2 \n\t" \ + "mov x14, x0 \n\t" \ + "add x1, x1, x3 \n\t" \ + "ldr q"#B1", [x1, #16*0] \n\t" \ + "ldr q"#B2", [x1, #16*1] \n\t" +// Start microkernel loop. +LABEL(K_MKER_LOOP) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,24,25,26) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,27,28,29) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,30,31,24) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,25,26,27) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,28,29,30) +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. 
+BEQ(FIN_MKER_LOOP) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,31,24,25) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,26,27,28) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,29,30,31) +BRANCH(K_MKER_LOOP) +// +// Final microkernel loop. +LABEL(FIN_MKER_LOOP) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC(21,22,23,31,24,25,x14,x9,x1,16*2,load) +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" add x1, x1, x3 \n\t" +DGEMM_6X6_MKER_LOOP_PLAIN_LOC(18,19,20,26,27,28,xzr,-1,xzr,-1,noload) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC(21,22,23,29,30,31,xzr,-1,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(K_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(WRITE_MEM_PREP) +" mov x14, x0 \n\t" +" ld1 {v18.d}[0], [x14], x9 \n\t" // Load A col. +" ld1 {v18.d}[1], [x14], x9 \n\t" +" ld1 {v19.d}[0], [x14], x9 \n\t" +" ld1 {v19.d}[1], [x14], x9 \n\t" +" ld1 {v20.d}[0], [x14], x9 \n\t" +" ld1 {v20.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" ldr q24, [x1, #16*0] \n\t" // Load B row. +" ldr q25, [x1, #16*1] \n\t" +" ldr q26, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +DGEMM_6X6_MKER_LOOP_PLAIN_LOC(18,19,20,24,25,26,xzr,-1,xzr,-1,noload) +BRANCH(K_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(WRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta. +" ld1r {v31.2d}, [x8] \n\t" +" fmov d26, #1.0 \n\t" +" fcmp d30, d26 \n\t" +BEQ(UNIT_ALPHA) +DSCALE6V(0,1,2,3,4,5,30,0) +DSCALE6V(6,7,8,9,10,11,30,0) +DSCALE6V(12,13,14,15,16,17,30,0) +LABEL(UNIT_ALPHA) +" \n\t" +" mov x1, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +" cmp x7, #8 \n\t" // Check for column-storage. +BNE(WRITE_MEM_C) +// +// C storage in rows. 
+" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_R_1_2) +DLOADC_3V_R_FWD(18,19,20,x1,0,x6) +DLOADC_3V_R_FWD(21,22,23,x1,0,x6) +DSCALEA6V(0,1,2,3,4,5,18,19,20,21,22,23,31,0) +LABEL(ZERO_BETA_R_1_2) +DSTOREC_3V_R_FWD(0,1,2,x5,0,x6) +DSTOREC_3V_R_FWD(3,4,5,x5,0,x6) +BEQ(ZERO_BETA_R_3_4_5_6) +DLOADC_3V_R_FWD(18,19,20,x1,0,x6) +DLOADC_3V_R_FWD(21,22,23,x1,0,x6) +DLOADC_3V_R_FWD(0,1,2,x1,0,x6) +DLOADC_3V_R_FWD(3,4,5,x1,0,x6) +DSCALEA6V(6,7,8,9,10,11,18,19,20,21,22,23,31,0) +DSCALEA6V(12,13,14,15,16,17,0,1,2,3,4,5,31,0) +LABEL(ZERO_BETA_R_3_4_5_6) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BNE(PRFM_END_R) // Prefetch a_next/b_next only when x12 == 1 (last millikernel pass). +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_R) +#endif +DSTOREC_3V_R_FWD(6,7,8,x5,0,x6) +DSTOREC_3V_R_FWD(9,10,11,x5,0,x6) +DSTOREC_3V_R_FWD(12,13,14,x5,0,x6) +DSTOREC_3V_R_FWD(15,16,17,x5,0,x6) +BRANCH(END_WRITE_MEM) +// +// C storage in columns. +LABEL(WRITE_MEM_C) +// In-register transpose, +// do transposition in row-order. +" trn1 v18.2d, v0.2d, v3.2d \n\t" // Row 0-1. +" trn2 v19.2d, v0.2d, v3.2d \n\t" +" trn1 v20.2d, v1.2d, v4.2d \n\t" +" trn2 v21.2d, v1.2d, v4.2d \n\t" +" trn1 v22.2d, v2.2d, v5.2d \n\t" +" trn2 v23.2d, v2.2d, v5.2d \n\t" +" \n\t" +" trn1 v24.2d, v6.2d, v9.2d \n\t" // Row 2-3. +" trn2 v25.2d, v6.2d, v9.2d \n\t" +" trn1 v26.2d, v7.2d, v10.2d \n\t" +" trn2 v27.2d, v7.2d, v10.2d \n\t" +" trn1 v28.2d, v8.2d, v11.2d \n\t" +" trn2 v29.2d, v8.2d, v11.2d \n\t" +" \n\t" +" trn1 v0.2d, v12.2d, v15.2d \n\t" // Row 4-5.
+" trn2 v1.2d, v12.2d, v15.2d \n\t" +" trn1 v2.2d, v13.2d, v16.2d \n\t" +" trn2 v3.2d, v13.2d, v16.2d \n\t" +" trn1 v4.2d, v14.2d, v17.2d \n\t" +" trn2 v5.2d, v14.2d, v17.2d \n\t" +" \n\t" +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_C_1_2) +DLOADC_3V_C_FWD(6,7,8,x1,0,x7) +DLOADC_3V_C_FWD(9,10,11,x1,0,x7) +DSCALEA6V(18,24,0,19,25,1,6,7,8,9,10,11,31,0) +LABEL(ZERO_BETA_C_1_2) +DSTOREC_3V_C_FWD(18,24,0,x5,0,x7) +DSTOREC_3V_C_FWD(19,25,1,x5,0,x7) +BEQ(ZERO_BETA_C_3_4_5_6) +DLOADC_3V_C_FWD(6,7,8,x1,0,x7) +DLOADC_3V_C_FWD(9,10,11,x1,0,x7) +DLOADC_3V_C_FWD(12,13,14,x1,0,x7) +DLOADC_3V_C_FWD(15,16,17,x1,0,x7) +DSCALEA6V(20,26,2,21,27,3,6,7,8,9,10,11,31,0) +DSCALEA6V(22,28,4,23,29,5,12,13,14,15,16,17,31,0) +LABEL(ZERO_BETA_C_3_4_5_6) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BNE(PRFM_END_C) // Prefetch a_next/b_next only when x12 == 1 (last millikernel pass). +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_C) +#endif +DSTOREC_3V_C_FWD(20,26,2,x5,0,x7) +DSTOREC_3V_C_FWD(21,27,3,x5,0,x7) +DSTOREC_3V_C_FWD(22,28,4,x5,0,x7) +DSTOREC_3V_C_FWD(23,29,5,x5,0,x7) +// +// End of this microkernel. +LABEL(END_WRITE_MEM) +" \n\t" +" subs x12, x12, #1 \n\t" +BEQ(END_EXEC) +" \n\t" +" mov x8, #6 \n\t" +" madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel. +" add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel. +BRANCH(MILLIKER_MLOOP) +// +// End of execution. +LABEL(END_EXEC) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a] "m" (ps_a), + [rs_b] "m" (rs_b), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + // In Clang, even "m"-passed parameter takes 1 register. + // Have to disable prefetching to pass compilation.
+#ifndef __clang__ + [a_next] "r" (a_next), + [b_next] "r" (b_next), +#endif + [m_iter] "m" (m_iter), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta) +: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10","x11","x12","x13","x14", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19","v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31" + ); + +consider_edge_cases: + // Forward address. + a = a + m_iter * ps_a; + c = c + m_iter * 6 * rs_c; + auxinfo_t data_d6x4mn = *data; + bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); + bli_dgemmsup_rv_armv8a_int_6x4mn + ( + conja, conjb, m_left, 6, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx + ); + +} + diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c new file mode 100644 index 000000000..afdd13e28 --- /dev/null +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c @@ -0,0 +1,513 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" +#include "assert.h" + + +// Label locality & misc. +#include "../armv8a_asm_utils.h" + +// Nanokernel operations. +#include "../armv8a_asm_d2x2.h" + +/* Odd-NR dgemmsup_rv_*m kernels are special in that + * despite of the row-major name, C is laid out in COLUMNS in the register space. 
+ * + * Block order: + * + * +---+ +---+ +---+ + * | 0 | | 3 | | 6 | |9 + * +---+ +---+ +---+ | + * +---+ +---+ +---+ + * | 1 | | 4 | | 7 | |10 + * +---+ +---+ +---+ | + * +---+ +---+ +---+ + * | 2 | | 5 | | 8 | |11 + * +---+ +---+ +---+ | + * + */ +#define DGEMM_C6X7_MKER_LOOP_PLAIN(C00,C01,C02,C03,C04,C05,C06,C10,C11,C12,C13,C14,C15,C16,C20,C21,C22,C23,C24,C25,C26,A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) \ + DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) \ + DGEMM_2X2_NANOKERNEL(C20,C21,A2,B0) \ + DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ + DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) \ + DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1) \ + DGEMM_2X2_NANOKERNEL(C22,C23,A2,B1) \ + DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) \ + DGEMM_2X2_NANOKERNEL(C04,C05,A0,B2) \ + DGEMM_2X2_NANOKERNEL(C14,C15,A1,B2) \ + DGEMM_2X2_NANOKERNEL(C24,C25,A2,B2) \ + DGEMM_LOAD1V_ ##LOADNEXT (B2,BADDR,BSHIFT+32) \ +" fmla v"#C06".2d, v"#A0".2d, v"#B3".d["#BIDX"] \n\t" \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \ +" fmla v"#C16".2d, v"#A1".2d, v"#B3".d["#BIDX"] \n\t" \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \ +" fmla v"#C26".2d, v"#A2".2d, v"#B3".d["#BIDX"] \n\t" + +// Interleaving load or not. +#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) +#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ +" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t" + +// #define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM) +// #define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \ +// DGEMM_LOAD1V_load(V1,ADDR,IMM) \ +// DGEMM_LOAD1V_load(V2,ADDR,IMM+16) + +#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST) +#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \ +" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \ +" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t" + +// Prefetch C in the long direction. +#define DPRFMC_FWD(CADDR,DLONGC) \ +" prfm PLDL1KEEP, ["#CADDR"] \n\t" \ +" add "#CADDR", "#CADDR", "#DLONGC" \n\t" + +// For column-storage of C. 
+#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ + DLOAD1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" +#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ + DSTORE1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" + +// For row-storage of C: Store 3+1/2 vectors. +#define DLOADC_3PHV_R_FWD(C0,C1,C2,C3,CIDX,CADDR,CSHIFT,RSC,CTMP) \ +" add "#CTMP", "#CADDR", "#CSHIFT"+48 \n\t" \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ + DLOAD1V(C2,CADDR,CSHIFT+32) \ +" ld1 {v"#C3".d}["#CIDX"], ["#CTMP"] \n\t" \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" +#define DSTOREC_3PHV_R_FWD(C0,C1,C2,C3,CIDX,CADDR,CSHIFT,RSC,CTMP) \ +" add "#CTMP", "#CADDR", "#CSHIFT"+48 \n\t" \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ + DSTORE1V(C2,CADDR,CSHIFT+32) \ +" st1 {v"#C3".d}["#CIDX"], ["#CTMP"] \n\t" \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +#define DSCALE7V(V0,V1,V2,V3,V4,V5,V6,A,IDX) \ + DSCALE4V(V0,V1,V2,V3,A,IDX) \ + DSCALE2V(V4,V5,A,IDX) \ + DSCALE1V(V6,A,IDX) +#define DSCALEA7V(D0,D1,D2,D3,D4,D5,D6,S0,S1,S2,S3,S4,S5,S6,A,IDX) \ + DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ + DSCALEA2V(D4,D5,S4,S5,A,IDX) \ + DSCALEA1V(D6,S6,A,IDX) +#define DSCALEA3V(D0,D1,D2,S0,S1,S2,A,IDX) \ + DSCALEA2V(D0,D1,S0,S1,A,IDX) \ + DSCALEA1V(D2,S2,A,IDX) + + +void bli_dgemmsup_rv_armv8a_asm_6x7m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( n0 == 7 ); + + // LLVM has very bad routing ability for inline asm. + // Limit number of registers in case of Clang compilation. 
+#ifndef __clang__ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); +#endif + uint64_t ps_a = bli_auxinfo_ps_a( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 7; + uint64_t k_left = k0 % 7; + + int64_t m_iter = m0 / 6; + int64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + // uint64_t cs_b = cs_b0; + assert( cs_b0 == 1 ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + __asm__ volatile + ( +" ldr x10, %[a] \n\t" +" ldr x13, %[c] \n\t" +" ldr x12, %[m_iter] \n\t" +" ldr x11, %[ps_a] \n\t" // Panel-skip of A. +" ldr x9, %[rs_a] \n\t" // Row-skip of A. +" ldr x2, %[cs_a] \n\t" // Column-skip of A. +" ldr x3, %[rs_b] \n\t" // Row-skip of B. +" \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. +" ldr x7, %[cs_c] \n\t" // Column-skip of C. +" \n\t" +" \n\t" // Multiply some address skips by sizeof(double). +" lsl x11, x11, #3 \n\t" // ps_a +" lsl x9, x9, #3 \n\t" // rs_a +" lsl x2, x2, #3 \n\t" // cs_a +" lsl x3, x3, #3 \n\t" // rs_b +" lsl x6, x6, #3 \n\t" // rs_c +" lsl x7, x7, #3 \n\t" // cs_c +" \n\t" +" mov x1, x13 \n\t" // C base address; x5 is not assigned until MILLIKER_MLOOP. +" cmp x7, #8 \n\t" // Prefetch column-strided C. +BNE(C_PREFETCH_COLS) +// This prefetch will not cover further mker parts. Skip. +// +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +BRANCH(C_PREFETCH_END) +LABEL(C_PREFETCH_COLS) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +LABEL(C_PREFETCH_END) +// +// Millikernel. +LABEL(MILLIKER_MLOOP) +" \n\t" +" mov x0, x10 \n\t" // Parameters to be reloaded +" mov x5, x13 \n\t" // within each millikernel loop. 
+" ldr x1, %[b] \n\t" +" ldr x4, %[k_mker] \n\t" +" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:20] <- C +// V[21:27] <- A +// V[28:31] <- B +// Under this scheme, the following is defined: +#define DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_C6X7_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) +// Load from memory. +LABEL(LOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(CLEAR_CCOLS) +" \n\t" +" mov x14, x0 \n\t" // Load A. +" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" ld1 {v22.d}[1], [x14], x9 \n\t" +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v24.d}[0], [x14], x9 \n\t" +" ld1 {v24.d}[1], [x14], x9 \n\t" +" ld1 {v25.d}[0], [x14], x9 \n\t" +" ld1 {v25.d}[1], [x14], x9 \n\t" +" ld1 {v26.d}[0], [x14], x9 \n\t" +" ld1 {v26.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v27.d}[0], [x14], x9 \n\t" +" ld1 {v27.d}[1], [x14], x9 \n\t" +" \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B. +" ldr q29, [x1, #16*1] \n\t" +" ldr q30, [x1, #16*2] \n\t" +" ldr d31, [x1, #16*3] \n\t" // Scalar loads into idx 0. +" add x1, x1, x3 \n\t" +" \n\t" +LABEL(CLEAR_CCOLS) +CLEAR4V(0,1,2,3) +CLEAR2V(4,5) +CLEAR1V(6) +CLEAR4V(7,8,9,10) +CLEAR2V(11,12) +CLEAR1V(13) +CLEAR4V(14,15,16,17) +CLEAR2V(18,19) +CLEAR1V(20) +// No-microkernel early return, once again. 
+BEQ(K_LEFT_LOOP) +// +// Microkernel is defined here as: +#define DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3,BIDX) \ + DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,BIDX,x14,x9,x1,0,load) \ + "add x0, x0, x2 \n\t" \ + "mov x14, x0 \n\t" \ + "ld1 {v"#A2".d}[0], [x14], x9 \n\t" \ + "ld1 {v"#A2".d}[1], [x14], x9 \n\t" \ + /* Due to this loading, BIDX can only be 0 here. */ \ + "ldr d"#B3", [x1, #16*3] \n\t" \ + "add x1, x1, x3 \n\t" +// Start microkernel loop. +LABEL(K_MKER_LOOP) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,28,29,30,31,0) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29,30,31,0) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(27,21,22,28,29,30,31,0) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,28,29,30,31,0) +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(26,27,21,28,29,30,31,0) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(22,23,24,28,29,30,31,0) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,28,29,30,31,0) +BRANCH(K_MKER_LOOP) +// +// Final microkernel loop. +LABEL(FIN_MKER_LOOP) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(26,27,21,28,29,30,31,0,x14,x9,x1,0,load) +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ldr d31, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(22,23,24,28,29,30,31,0,xzr,-1,xzr,-1,noload) +" ldr q28, [x1, #16*0] \n\t" +" ldr q29, [x1, #16*1] \n\t" +" ldr q30, [x1, #16*2] \n\t" +" ldr d31, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(25,26,27,28,29,30,31,0,xzr,-1,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(K_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(WRITE_MEM_PREP) +" mov x14, x0 \n\t" // Load A col. 
+" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" ld1 {v22.d}[1], [x14], x9 \n\t" +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B row. +" ldr q29, [x1, #16*1] \n\t" +" ldr q30, [x1, #16*2] \n\t" +" ldr d31, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(21,22,23,28,29,30,31,0,xzr,-1,xzr,-1,noload) +BRANCH(K_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(WRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta. +" ld1r {v31.2d}, [x8] \n\t" +" fmov d26, #1.0 \n\t" +" fcmp d30, d26 \n\t" +BEQ(UNIT_ALPHA) +DSCALE7V(0,1,2,3,4,5,6,30,0) +DSCALE7V(7,8,9,10,11,12,13,30,0) +DSCALE7V(14,15,16,17,18,19,20,30,0) +LABEL(UNIT_ALPHA) +" \n\t" +" mov x1, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +" cmp x7, #8 \n\t" // Check for column-storage. +BNE(WRITE_MEM_C) +// Unlike other RV kernels, here row-storage of C requires +// in-register transpose. 
+" trn1 v21.2d, v0.2d, v1.2d \n\t" +" trn2 v22.2d, v0.2d, v1.2d \n\t" +" trn1 v23.2d, v2.2d, v3.2d \n\t" +" trn2 v24.2d, v2.2d, v3.2d \n\t" +" trn1 v25.2d, v4.2d, v5.2d \n\t" +" trn2 v26.2d, v4.2d, v5.2d \n\t" +" \n\t" +" trn1 v0.2d, v7.2d, v8.2d \n\t" +" trn2 v1.2d, v7.2d, v8.2d \n\t" +" trn1 v2.2d, v9.2d, v10.2d \n\t" +" trn2 v3.2d, v9.2d, v10.2d \n\t" +" trn1 v4.2d, v11.2d, v12.2d \n\t" +" trn2 v5.2d, v11.2d, v12.2d \n\t" +" \n\t" +" trn1 v7.2d, v14.2d, v15.2d \n\t" +" trn2 v8.2d, v14.2d, v15.2d \n\t" +" trn1 v9.2d, v16.2d, v17.2d \n\t" +" trn2 v10.2d, v16.2d, v17.2d \n\t" +" trn1 v11.2d, v18.2d, v19.2d \n\t" +" trn2 v12.2d, v18.2d, v19.2d \n\t" +" \n\t" +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_R_1_2) +DLOADC_3PHV_R_FWD(14,15,16,30,0,x1,0,x6,x8) +DLOADC_3PHV_R_FWD(17,18,19,30,1,x1,0,x6,x8) +DSCALEA7V(21,23,25,22,24,26,6,14,15,16,17,18,19,30,31,0) +LABEL(ZERO_BETA_R_1_2) +DSTOREC_3PHV_R_FWD(21,23,25,6,0,x5,0,x6,x8) +DSTOREC_3PHV_R_FWD(22,24,26,6,1,x5,0,x6,x8) +BEQ(ZERO_BETA_R_3_4_5_6) +DLOADC_3PHV_R_FWD(14,15,16,30,0,x1,0,x6,x8) +DLOADC_3PHV_R_FWD(17,18,19,30,1,x1,0,x6,x8) +DLOADC_3PHV_R_FWD(21,22,23,28,0,x1,0,x6,x8) +DLOADC_3PHV_R_FWD(24,25,26,28,1,x1,0,x6,x8) +DSCALEA7V(0,2,4,1,3,5,13,14,15,16,17,18,19,30,31,0) +DSCALEA7V(7,9,11,8,10,12,20,21,22,23,24,25,26,28,31,0) +LABEL(ZERO_BETA_R_3_4_5_6) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BNE(PRFM_END_R) // Prefetch next A/B only in the last m-iteration (x12 == 1). +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_R) +#endif +DSTOREC_3PHV_R_FWD(0,2,4,13,0,x5,0,x6,x8) +DSTOREC_3PHV_R_FWD(1,3,5,13,1,x5,0,x6,x8) +DSTOREC_3PHV_R_FWD(7,9,11,20,0,x5,0,x6,x8) +DSTOREC_3PHV_R_FWD(8,10,12,20,1,x5,0,x6,x8) +BRANCH(END_WRITE_MEM) +// +// C storage in columns. 
+LABEL(WRITE_MEM_C) +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_C_1_2) +DLOADC_3V_C_FWD(21,22,23,x1,0,x7) +DLOADC_3V_C_FWD(24,25,26,x1,0,x7) +DSCALEA3V(0,7,14,21,22,23,31,0) +DSCALEA3V(1,8,15,24,25,26,31,0) +LABEL(ZERO_BETA_C_1_2) +DSTOREC_3V_C_FWD(0,7,14,x5,0,x7) +DSTOREC_3V_C_FWD(1,8,15,x5,0,x7) +BEQ(ZERO_BETA_C_3_4_5_6_7) +DLOADC_3V_C_FWD(21,22,23,x1,0,x7) +DLOADC_3V_C_FWD(24,25,26,x1,0,x7) +DLOADC_3V_C_FWD(27,28,29,x1,0,x7) +DLOADC_3V_C_FWD(0,7,14,x1,0,x7) +DLOADC_3V_C_FWD(1,8,15,x1,0,x7) +DSCALEA3V(2,9,16,21,22,23,31,0) +DSCALEA3V(3,10,17,24,25,26,31,0) +DSCALEA3V(4,11,18,27,28,29,31,0) +DSCALEA3V(5,12,19,0,7,14,31,0) +DSCALEA3V(6,13,20,1,8,15,31,0) +LABEL(ZERO_BETA_C_3_4_5_6_7) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BNE(PRFM_END_C) // Prefetch next A/B only in the last m-iteration (x12 == 1). +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_C) +#endif +DSTOREC_3V_C_FWD(2,9,16,x5,0,x7) +DSTOREC_3V_C_FWD(3,10,17,x5,0,x7) +DSTOREC_3V_C_FWD(4,11,18,x5,0,x7) +DSTOREC_3V_C_FWD(5,12,19,x5,0,x7) +DSTOREC_3V_C_FWD(6,13,20,x5,0,x7) +// +// End of this microkernel. +LABEL(END_WRITE_MEM) +" \n\t" +" subs x12, x12, #1 \n\t" +BEQ(END_EXEC) +" \n\t" +" mov x8, #6 \n\t" +" madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel. +" add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel. +BRANCH(MILLIKER_MLOOP) +// +// End of execution. +LABEL(END_EXEC) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a] "m" (ps_a), + [rs_b] "m" (rs_b), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + // In Clang, even "m"-passed parameter takes 1 register. + // Have to disable prefetching to pass compilation. 
+#ifndef __clang__ + [a_next] "r" (a_next), + [b_next] "r" (b_next), +#endif + [m_iter] "m" (m_iter), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta) +: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10","x11","x12","x13","x14", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19","v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31", "cc","memory" // Flags are modified (cmp/subs/fcmp) and C is stored through pointers. + ); + +consider_edge_cases: + // Forward address. + a = a + m_iter * ps_a; + c = c + m_iter * 6 * rs_c; + auxinfo_t data_d6x4mn = *data; + bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); + bli_dgemmsup_rv_armv8a_int_6x4mn + ( + conja, conjb, m_left, 7, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx + ); + +} + diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c index 8ff5ec173..b912480fa 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c @@ -37,7 +37,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. #include "../armv8a_asm_utils.h" @@ -146,47 +145,70 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m { if ( n0 != 8 ) { - if ( n0 < 8 ) - { - for ( ; n0 >= 4; n0 -= 4 ) - { - dgemmsup_ker_ft ukr_fp; - auxinfo_t data_d8xkm = *data; - if ( bli_auxinfo_ps_a( data ) == 6 * rs_a0 ) - { - // Use 8x4 Asm kernel for the unpacked case. - bli_auxinfo_set_ps_a( 8 * rs_a0, &data_d8xkm ); - ukr_fp = bli_dgemmsup_rv_armv8a_asm_8x4m; - } - else - { - // Cannot change dimension for m when A is packed. 
- ukr_fp = bli_dgemmsup_rv_armv8a_int_6x4mn; - } + assert( n0 <= 13 ); - ukr_fp - ( - conja, conjb, m0, 4, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, &data_d8xkm, cntx - ); - b += 4 * cs_b0; - c += 4 * cs_c0; - } - if ( n0 > 0 ) - { - bli_dgemmsup_rv_armv8a_int_6x4mn - ( - conja, conjb, m0, n0, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - } - } - else + // Manual separation. + dgemmsup_ker_ft ker_fp1 = NULL; + dgemmsup_ker_ft ker_fp2 = NULL; + dim_t nr1, nr2; + + if ( n0 == 13 ) { - assert( FALSE ); + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x7m; nr1 = 7; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr2 = 6; } + if ( n0 == 12 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr2 = 6; + } + if ( n0 == 11 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr2 = 5; + } + if ( n0 == 10 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr2 = 5; + } + if ( n0 == 9 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5; + ker_fp2 = bli_dgemmsup_rv_armv8a_int_6x4mn; nr2 = 4; + } + if ( n0 == 7 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x7m; nr1 = 7; + } + if ( n0 == 6 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6; + } + if ( n0 == 5 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5; + } + if ( n0 <= 4 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_int_6x4mn; nr1 = n0; + } + + ker_fp1 + ( + conja, conjb, m0, nr1, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + b += nr1 * cs_b0; + c += nr1 * cs_c0; + if ( ker_fp2 ) + ker_fp2 + ( + conja, conjb, m0, nr2, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); return; } @@ -534,7 +556,6 @@ consider_edge_cases: // Forward address. 
a = a + m_iter * ps_a; c = c + m_iter * 6 * rs_c; -#if 1 auxinfo_t data_d6x4mn = *data; bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); bli_dgemmsup_rv_armv8a_int_6x4mn @@ -543,33 +564,6 @@ consider_edge_cases: alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx ); -#else - if ( m_left >= 4 ) - { - // Calls 4x8m with only 1 outermost loop. - // As only 1 outermost loop is called, - // ps_a needs not being set here. - // - bli_dgemmsup_rv_armv8a_asm_4x8m - ( - conja, conjb, 4, 8, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - m_left -= 4; - a = a + 4 * rs_a; - c = c + 4 * rs_c; - } - if ( m_left ) - { - bli_dgemmsup_r_armv8a_ref2 - ( - conja, conjb, m_left, 8, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - } -#endif } diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c index 9bdf4b3b8..910e07dbb 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c @@ -37,7 +37,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. #include "../armv8a_asm_utils.h" @@ -146,33 +145,56 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n { if ( m0 != 6 ) { - // 5 = 4 + 1; - // 4; - // - while ( m0 >= 4 ) + assert( m0 <= 9 ); + + // Manual separation. 
+ dgemmsup_ker_ft ker_fp1 = NULL; + dgemmsup_ker_ft ker_fp2 = NULL; + dim_t mr1, mr2; + + if ( m0 == 9 ) { - bli_dgemmsup_rv_armv8a_asm_4x8n - ( - conja, conjb, 4, n0, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - m0 -= 4; - a += 4 * rs_a0; - c += 4 * rs_c0; + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_5x8n; mr1 = 5; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr2 = 4; + } + if ( m0 == 8 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr2 = 4; + } + if ( m0 == 7 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4; + ker_fp2 = bli_dgemmsup_rv_armv8a_int_3x8mn; mr2 = 3; + } + if ( m0 == 5 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_5x8n; mr1 = 5; + } + if ( m0 == 4 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4; + } + if ( m0 < 4 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_int_3x8mn; mr1 = m0; } - // 3, 2, 1; - // - if ( m0 > 0 ) - { - bli_dgemmsup_rv_armv8a_int_3x8mn + ker_fp1 + ( + conja, conjb, mr1, n0, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + a += mr1 * rs_a0; + c += mr1 * rs_c0; + if ( ker_fp2 ) + ker_fp2 ( - conja, conjb, m0, n0, k0, + conja, conjb, mr2, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); - } return; } diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c index 4d374df98..d3af5781c 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c @@ -36,7 +36,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. 
#include "../armv8a_asm_utils.h" diff --git a/kernels/armv8a/bli_kernels_armv8a.h b/kernels/armv8a/bli_kernels_armv8a.h index b7ab75541..64a3f2fb5 100644 --- a/kernels/armv8a/bli_kernels_armv8a.h +++ b/kernels/armv8a/bli_kernels_armv8a.h @@ -39,6 +39,8 @@ PACKM_KER_PROT( double, d, packm_armv8a_int_8xk ) GEMM_UKR_PROT( float, s, gemm_armv8a_asm_8x12 ) GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8 ) +GEMM_UKR_PROT( float, s, gemm_armv8a_asm_12x8r ) +GEMM_UKR_PROT( double, d, gemm_armv8a_asm_8x6r ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8r ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_8x4 ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_4x4 ) @@ -47,6 +49,10 @@ GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8m ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x7m ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x6m ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x5m ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_5x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_8x4m )