From c19db2ff826e2ea6ac54569e8aa37e91bdf7cabe Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Wed, 15 Sep 2021 23:39:53 +0900 Subject: [PATCH] Arm SVE Add ZGEMM 2Vx10 Unindexed --- kernels/armsve/3/armsve_asm_2vx10cmplx.h | 114 ++++++++ .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 275 ++++++++++++++++++ kernels/armsve/bli_kernels_armsve.h | 1 + 3 files changed, 390 insertions(+) create mode 100644 kernels/armsve/3/armsve_asm_2vx10cmplx.h create mode 100644 kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c diff --git a/kernels/armsve/3/armsve_asm_2vx10cmplx.h b/kernels/armsve/3/armsve_asm_2vx10cmplx.h new file mode 100644 index 000000000..4fc695049 --- /dev/null +++ b/kernels/armsve/3/armsve_asm_2vx10cmplx.h @@ -0,0 +1,114 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +*/ +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,16) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,18) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,1) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,3) \ + GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,5) \ + GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,7) \ + GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,9) \ + GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,11) \ + GEMM_FMLA2_LD1R(C8Re,C8Im,PT,AColRe,AColIm,BV0,BAddr,13) \ + GEMM_FMLA2_LD1R(C9Re,C9Im,PT,AColRe,AColIm,BV1,BAddr,15) \ + \ + GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV2,BAddr,17) \ + GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV3,BAddr,19) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ + GEMM_FMLX2_LD1R(C2Im,C2Re,PT,AColRe,AColIm,BV4,BAddr,0) \ + GEMM_FMLX2_LD1R(C3Im,C3Re,PT,AColRe,AColIm,BV5,BAddr,2) \ + GEMM_FMLX2_LD1R(C4Im,C4Re,PT,AColRe,AColIm,BV6,BAddr,4) \ + GEMM_FMLX2_LD1R(C5Im,C5Re,PT,AColRe,AColIm,BV7,BAddr,6) \ + GEMM_FMLX2_LD1R(C6Im,C6Re,PT,AColRe,AColIm,BV0,BAddr,8) \ + GEMM_FMLX2_LD1R(C7Im,C7Re,PT,AColRe,AColIm,BV1,BAddr,10) \ + GEMM_FMLX2_LD1R(C8Im,C8Re,PT,AColRe,AColIm,BV2,BAddr,12) \ + GEMM_FMLX2_LD1R(C9Im,C9Re,PT,AColRe,AColIm,BV3,BAddr,14) + +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BAddr,BRSBit) + +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,16) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,18) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,1) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,3) \ + GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,5) \ + GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,7) \ + GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,9) \ + GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,11) \ + GEMM_FMLA2_LD1R(C8Re,C8Im,PT,AColRe,AColIm,BV0,BAddr,13) \ + GEMM_FMLA2_LD1R(C9Re,C9Im,PT,AColRe,AColIm,BV1,BAddr,15) \ + \ + GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV2,BAddr,17) \ + GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV3,BAddr,19) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ + GEMM_FMLX2(C2Im,C2Re,PT,AColRe,AColIm,BV4) \ + GEMM_FMLX2(C3Im,C3Re,PT,AColRe,AColIm,BV5) \ + GEMM_FMLX2(C4Im,C4Re,PT,AColRe,AColIm,BV6) \ + GEMM_FMLX2(C5Im,C5Re,PT,AColRe,AColIm,BV7) \ + GEMM_FMLX2(C6Im,C6Re,PT,AColRe,AColIm,BV0) \ + GEMM_FMLX2(C7Im,C7Re,PT,AColRe,AColIm,BV1) \ + GEMM_FMLX2(C8Im,C8Re,PT,AColRe,AColIm,BV2) \ + GEMM_FMLX2(C9Im,C9Re,PT,AColRe,AColIm,BV3) + +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + 
GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BAddr,BRSBit)
+
+#define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \
+ CLEAR_COL4(Z00,Z01,Z02,Z03) \
+ CLEAR_COL4(Z04,Z05,Z06,Z07) \
+ CLEAR_COL4(Z08,Z09,Z10,Z11) \
+ CLEAR_COL4(Z12,Z13,Z14,Z15) \
+ CLEAR_COL4(Z16,Z17,Z18,Z19)
+
+#define GEMM_FMULCMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \
+ FMUL_COL2(ZD0Re,ZD0Im,Z0Re,Z0Im,ZFactorRe) \
+ FMUL_COL2(ZD1Re,ZD1Im,Z1Re,Z1Im,ZFactorRe) \
+ GEMM_FMLX2(ZD0Im,ZD0Re,PT,Z0Re,Z0Im,ZFactorIm) \
+ GEMM_FMLX2(ZD1Im,ZD1Re,PT,Z1Re,Z1Im,ZFactorIm)
+
+#define GEMM_FMLACMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \
+ GEMM_FMLACMPLX(ZD0Re,ZD0Im,PT,Z0Re,Z0Im,ZFactorRe,ZFactorIm) \
+ GEMM_FMLACMPLX(ZD1Re,ZD1Im,PT,Z1Re,Z1Im,ZFactorRe,ZFactorIm)
+
+#define GEMM_CCMPLX_LOAD_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \
+ GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \
+ GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z1Re,Z1Im,PT,CAddr,CCS)
+
+#define GEMM_CCMPLX_STORE_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \
+ GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \
+ GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z1Re,Z1Im,PT,CAddr,CCS)
+
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
new file mode 100644
index 000000000..c5e7713f6
--- /dev/null
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -0,0 +1,275 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas at Austin
+ Copyright (C) 2019, Forschungszentrum Juelich
+ Copyright (C) 2020, The University of Tokyo
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+#include "blis.h"
+
+// Double-precision composite instructions.
+#include "armsve_asm_macros_dcomplex.h"
+
+// 2vx10 microkernels.
+#include "armsve_asm_2vx10cmplx.h" + +#include + +void bli_zgemm_armsve_asm_2vx10_unindexed + ( + dim_t k0, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // TODO: Write. + assert( rs_c0 == 1 ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + uint64_t info = 0; + + __asm__ volatile ( +// " ldr x0, %[a] \n\t" +// " ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" incd x2, ALL, MUL #1 \n\t" // Column-skip of A. +" mov x3, #10 \n\t" // Row-skip of B. +" \n\t" +// " ldr x2, %[c] \n\t" +// " ldr x3, %[rs_c] \n\t" // Row-skip of C. +// " ldr x4, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %0, %0, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %1, %1, x16 \n\t" +" mov x16, 0x3 \n\t" // Tag C address. +" lsl x16, x16, #56 \n\t" +" orr %2, %2, x16 \n\t" +#endif +" \n\t" +" mov x16, #16 \n\t" // Multiply some address skips by sizeof(dcomplex). +" madd x2, x16, x2, xzr \n\t" // cs_a +" madd x3, x16, x3, xzr \n\t" // rs_b +" madd %4, x16, %4, xzr \n\t" // cs_c +" ptrue p0.d \n\t" +" \n\t" +// " ldr x5, %[k_mker] \n\t" // Number of loops. +// " ldr x6, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp %5, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" +" \n\t" +" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. +" ld1rd z21.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*14] \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" \n\t" +" CCOL_PRFM: \n\t" +" cmp %3, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +" mov x16, %2 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %5 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %5 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %5 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %5 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %5 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %5 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %5 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %5 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %5 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) +" \n\t" +" cmp %5, #0 \n\t" // If no 4-microkernel can be applied. 
+" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" subs %5, %5, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp %6, #0 \n\t" // End of execution. +" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. +" ld1rd z21.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*14] \n\t" +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" sub %6, %6, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. +" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). +// " ldr x8, %[beta] \n\t" +" ld1rd z28.d, p0/z, [%7] \n\t" // Real(alpha). +" ld1rd z29.d, p0/z, [%7, 8] \n\t" // Imag(alpha). +" ld1rd z30.d, p0/z, [%8] \n\t" // Real(beta). +" ld1rd z31.d, p0/z, [%8, 8] \n\t" // Imag(beta). +" \n\t" +" PREFETCH_ABNEXT: \n\t" +// " ldr x9, %[a_next] \n\t" +// " ldr x10, %[b_next] \n\t" +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %9, %9, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %10, %10, x16 \n\t" +#endif +" prfm PLDL1STRM, [%9] \n\t" +" prfm PLDL1STRM, [%9, 256*1] \n\t" +" prfm PLDL1STRM, [%10] \n\t" +" prfm PLDL1STRM, [%10, 256*1] \n\t" +" \n\t" +" WRITE_MEM: \n\t" +" \n\t" +GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29) +GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29) +GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29) +GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29) +GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29) +" \n\t" +" UNIT_ALPHA: \n\t" +" mov x9, %2 \n\t" // C address for loading. +" \n\t" // C address for storing is %2 itself. 
+" cmp %3, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. +// TODO: Implement. +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_EXEC: \n\t" +" mov %11, #0 \n\t" // Return normal. +: "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (rs_c), // %3 + "+r" (cs_c), // %4 + "+r" (k_mker), // %5 + "+r" (k_left), // %6 + "+r" (alpha), // %7 + "+r" (beta), // %8 + "+r" (a_next), // %9 + "+r" (b_next), // %10 + "=r" (info) // %11 +: +: "x2","x3","x9","x16", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h index 3de94cd61..65dba4caf 100644 --- a/kernels/armsve/bli_kernels_armsve.h +++ b/kernels/armsve/bli_kernels_armsve.h @@ -35,6 +35,7 @@ GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed ) +GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed ) GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx7_unindexed ) //GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed )