mirror of
https://github.com/amd/blis.git
synced 2026-05-13 02:25:39 +00:00
Arm SVE Add ZGEMM 2Vx8 Unindexed
This commit is contained in:
108
kernels/armsve/3/armsve_asm_2vx8cmplx.h
Normal file
108
kernels/armsve/3/armsve_asm_2vx8cmplx.h
Normal file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
/*
 * One k-iteration (rank-1 update) of the 2Vx8 complex microkernel, fused
 * with the broadcast loads of B that the following iteration consumes.
 *
 *   C<j>Re / C<j>Im  accumulators for the real / imaginary part of column j.
 *   PT               governing predicate.
 *   AColRe / AColIm  real / imaginary part of the current column of A.
 *   BV0..BV7         on entry: elements 0,2,..,14 of the current B row
 *                    (the real parts of b0..b7 in the kernel below).
 *   BV8..BV11        on entry: elements 1,3,5,7 of the current B row
 *                    (the imaginary parts of b0..b3).
 *   BAddr            address of the current B row; advanced by BRSBit.
 *   BRSBit           register holding the row stride of B in bytes.
 *
 * Load offsets are in units of SZ. Elements 9,11,13,15 (imaginary parts of
 * b4..b7) are fetched from the *current* row into BV0..BV3 before BAddr
 * moves; everything after the "add" is fetched from the next row.
 *
 * On exit the roles have rotated by four registers:
 *   BV4..BV7  = elements 0,2,4,6    of the next row,
 *   BV8..BV11 = elements 8,10,12,14 of the next row,
 *   BV0..BV3  = elements 1,3,5,7    of the next row.
 * The _C_2 / _C_3 variants undo this rotation by permuting the BV arguments.
 *
 * NOTE(review): GEMM_FMLA2_LD1R comes from armsve_asm_macros.h, which is not
 * shown here; by analogy with GEMM_FMLX2_LD1R it is assumed to do two FMLAs
 * with BV and then reload BV from [BAddr, #NSHIFT*SZ] -- confirm.
 */
#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \
  GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,9) \
  GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,11) \
  GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,13) \
  GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,15) \
" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \
  GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,0) \
  GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,2) \
  GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,4) \
  GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,6) \
  \
  GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV8,BAddr,8) \
  GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV9,BAddr,10) \
  GEMM_FMLX2_LD1R(C2Im,C2Re,PT,AColRe,AColIm,BV10,BAddr,12) \
  GEMM_FMLX2_LD1R(C3Im,C3Re,PT,AColRe,AColIm,BV11,BAddr,14) \
  GEMM_FMLX2_LD1R(C4Im,C4Re,PT,AColRe,AColIm,BV0,BAddr,1) \
  GEMM_FMLX2_LD1R(C5Im,C5Re,PT,AColRe,AColIm,BV1,BAddr,3) \
  GEMM_FMLX2_LD1R(C6Im,C6Re,PT,AColRe,AColIm,BV2,BAddr,5) \
  GEMM_FMLX2_LD1R(C7Im,C7Re,PT,AColRe,AColIm,BV3,BAddr,7)
|
||||
|
||||
/*
 * Second phase of the three-phase B-register rotation: same update as
 * GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1, but with the BV arguments rotated left
 * by four (BV4..BV11,BV0..BV3) so that it picks up the register roles left
 * behind by a preceding _C_1 step. Takes the same argument list as _C_1.
 */
#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \
  GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BAddr,BRSBit)
|
||||
|
||||
/*
 * Third phase of the three-phase B-register rotation: same update as
 * GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1, but with the BV arguments rotated left
 * by eight (BV8..BV11,BV0..BV7). After _C_1, _C_2, _C_3 in sequence the
 * twelve B registers are back in their original roles.
 */
#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \
  GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit)
|
||||
|
||||
/*
 * Final-iteration variant of GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1.
 *
 * Performs the same arithmetic for one k-iteration but does not preload
 * anything from the next row of B, so it never reads past the last row.
 * The only loads are elements 9,11,13,15 of the *current* row (into
 * BV0..BV3), which this same iteration still needs. BAddr is nevertheless
 * advanced by BRSBit. Arguments are as for _C_1.
 *
 * NOTE(review): GEMM_FMLA2 / GEMM_FMLA2_LD1R come from armsve_asm_macros.h,
 * which is not shown here -- confirm their operand order against it.
 */
#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \
  GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,9) \
  GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,11) \
  GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,13) \
  GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,15) \
" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \
  GEMM_FMLA2(C4Re,C4Im,PT,AColRe,AColIm,BV4) \
  GEMM_FMLA2(C5Re,C5Im,PT,AColRe,AColIm,BV5) \
  GEMM_FMLA2(C6Re,C6Im,PT,AColRe,AColIm,BV6) \
  GEMM_FMLA2(C7Re,C7Im,PT,AColRe,AColIm,BV7) \
  \
  GEMM_FMLX2(C0Im,C0Re,PT,AColRe,AColIm,BV8) \
  GEMM_FMLX2(C1Im,C1Re,PT,AColRe,AColIm,BV9) \
  GEMM_FMLX2(C2Im,C2Re,PT,AColRe,AColIm,BV10) \
  GEMM_FMLX2(C3Im,C3Re,PT,AColRe,AColIm,BV11) \
  GEMM_FMLX2(C4Im,C4Re,PT,AColRe,AColIm,BV0) \
  GEMM_FMLX2(C5Im,C5Re,PT,AColRe,AColIm,BV1) \
  GEMM_FMLX2(C6Im,C6Re,PT,AColRe,AColIm,BV2) \
  GEMM_FMLX2(C7Im,C7Re,PT,AColRe,AColIm,BV3)
|
||||
|
||||
/*
 * Final-iteration variant of GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3: applies
 * GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL with the BV arguments rotated
 * left by eight, i.e. the third rotation phase without next-row preloads.
 */
#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \
  GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit)
|
||||
|
||||
/*
 * Applies CLEAR_COL4 to sixteen vector registers, four at a time.
 * NOTE(review): CLEAR_COL4 is defined in armsve_asm_macros.h (not shown
 * here); it is presumed to zero its four register arguments -- confirm.
 */
#define CLEAR_COL16(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15) \
  CLEAR_COL4(Z00,Z01,Z02,Z03) \
  CLEAR_COL4(Z04,Z05,Z06,Z07) \
  CLEAR_COL4(Z08,Z09,Z10,Z11) \
  CLEAR_COL4(Z12,Z13,Z14,Z15)
|
||||
|
||||
/*
 * Complex scaling of two columns: ZD<i> := Z<i> * ZFactor, for i = 0, 1.
 *
 * Step 1 (FMUL_COL2): ZDRe = ZRe * FactorRe,  ZDIm = ZIm * FactorRe.
 * Step 2 (GEMM_FMLX2): ZDIm += ZRe * FactorIm, ZDRe -= ZIm * FactorIm.
 *
 * The destination pair must not alias the source pair of the same column,
 * because step 2 re-reads the sources after step 1 has written the
 * destinations.
 */
#define GEMM_FMULCMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \
  FMUL_COL2(ZD0Re,ZD0Im,Z0Re,Z0Im,ZFactorRe) \
  FMUL_COL2(ZD1Re,ZD1Im,Z1Re,Z1Im,ZFactorRe) \
  GEMM_FMLX2(ZD0Im,ZD0Re,PT,Z0Re,Z0Im,ZFactorIm) \
  GEMM_FMLX2(ZD1Im,ZD1Re,PT,Z1Re,Z1Im,ZFactorIm)
|
||||
|
||||
/*
 * Complex multiply-accumulate on two columns: applies GEMM_FMLACMPLX to
 * (ZD0, Z0) and to (ZD1, Z1) with the same complex factor
 * (ZFactorRe, ZFactorIm).
 */
#define GEMM_FMLACMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \
  GEMM_FMLACMPLX(ZD0Re,ZD0Im,PT,Z0Re,Z0Im,ZFactorRe,ZFactorIm) \
  GEMM_FMLACMPLX(ZD1Re,ZD1Im,PT,Z1Re,Z1Im,ZFactorRe,ZFactorIm)
|
||||
|
||||
/*
 * Loads two consecutive contiguous columns of C into (Z0Re,Z0Im) and
 * (Z1Re,Z1Im), adding the column stride CCS to CAddr after each load
 * (so CAddr ends up two columns further on).
 */
#define GEMM_CCMPLX_LOAD_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \
  GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \
  GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z1Re,Z1Im,PT,CAddr,CCS)
|
||||
|
||||
/*
 * Stores (Z0Re,Z0Im) and (Z1Re,Z1Im) to two consecutive contiguous columns
 * of C, adding the column stride CCS to CAddr after each store
 * (so CAddr ends up two columns further on).
 */
#define GEMM_CCMPLX_STORE_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \
  GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \
  GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z1Re,Z1Im,PT,CAddr,CCS)
|
||||
|
||||
73
kernels/armsve/3/armsve_asm_macros_cmplx.h
Normal file
73
kernels/armsve/3/armsve_asm_macros_cmplx.h
Normal file
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "armsve_asm_macros.h"
|
||||
|
||||
/*
 * Unpredicated element-wise multiply of two vectors by one factor:
 *   ZD0 = Z0 * ZFACTOR,  ZD1 = Z1 * ZFACTOR.
 * Expands to an assembly string fragment; DT selects the element size.
 * (No line continuation after the last instruction, so the macro ends here
 * regardless of what follows it in the file.)
 */
#define FMUL_COL2(ZD0,ZD1,Z0,Z1,ZFACTOR) \
" fmul "#ZD0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \
" fmul "#ZD1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t"
|
||||
|
||||
/*
 * Paired multiply-add / multiply-subtract under predicate PT (merging):
 *   CCOLFH += ACOLFH * BV   (fmla)
 *   CCOLLH -= ACOLLH * BV   (fmls)
 * Called with (Im, Re) destinations and (Re, Im) sources, this contributes
 * the imaginary-factor half of a complex product.
 */
#define GEMM_FMLX2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
" fmla "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" \
" fmls "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t"
|
||||
|
||||
/*
 * GEMM_FMLX2 followed by a broadcast reload of BV: once BV has been
 * consumed, it is refilled (zeroing predication) with the scalar at
 * byte offset NSHIFT * SZ from BADDR.
 */
#define GEMM_FMLX2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \
  GEMM_FMLX2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
" "LD1R" "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t"
|
||||
|
||||
/*
 * Complex multiply-accumulate ZD += Z0 * Z1 on split real/imaginary
 * registers, built from GEMM_FMLA2 (Z1Re terms) and GEMM_FMLX2 (Z1Im terms).
 * NOTE(review): GEMM_FMLA2 is defined in armsve_asm_macros.h (not shown
 * here); it is presumed to compute ZDRe += Z0Re*Z1Re and ZDIm += Z0Im*Z1Re
 * -- confirm.
 */
#define GEMM_FMLACMPLX(ZDRe,ZDIm,PT,Z0Re,Z0Im,Z1Re,Z1Im) \
  GEMM_FMLA2(ZDRe,ZDIm,PT,Z0Re,Z0Im,Z1Re) \
  GEMM_FMLX2(ZDIm,ZDRe,PT,Z0Re,Z0Im,Z1Im)
|
||||
|
||||
/*
 * Structure load (LD2) of interleaved complex data at AAddr: element pairs
 * are de-interleaved into ZRe (first of each pair) and ZIm (second), with
 * inactive lanes zeroed under PT.
 */
#define GEMM_ACOLCMPLX_CONTIGUOUS_LOAD(ZRe,ZIm,PT,AAddr) \
" "LD2" {"#ZRe"."DT", "#ZIm"."DT"}, "#PT"/z, ["#AAddr"] \n\t"
|
||||
|
||||
/*
 * Structure store (ST2): interleaves ZRe and ZIm element-wise and writes
 * the active lanes (under PT) to AAddr.
 */
#define GEMM_ACOLCMPLX_CONTIGUOUS_STORE(ZRe,ZIm,PT,AAddr) \
" "ST2" {"#ZRe"."DT", "#ZIm"."DT"}, "#PT", ["#AAddr"] \n\t"
|
||||
|
||||
/*
 * Loads one contiguous complex column from AAddr into (ZRe, ZIm), then
 * advances AAddr by the byte stride held in register ACS.
 */
#define GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,AAddr,ACS) \
  GEMM_ACOLCMPLX_CONTIGUOUS_LOAD(ZRe,ZIm,PT,AAddr) \
" add "#AAddr", "#AAddr", "#ACS" \n\t" /* Forward A address (load) to next column. */
|
||||
|
||||
/*
 * Same operation as GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD, named for use on
 * columns of C: load from CAddr, then advance CAddr by CCS.
 */
#define GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,CAddr,CCS) \
  GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,CAddr,CCS)
|
||||
|
||||
/*
 * Stores (ZRe, ZIm) as one contiguous complex column at AAddr, then
 * advances AAddr by the byte stride held in register ACS.
 */
#define GEMM_ACOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,AAddr,ACS) \
  GEMM_ACOLCMPLX_CONTIGUOUS_STORE(ZRe,ZIm,PT,AAddr) \
" add "#AAddr", "#AAddr", "#ACS" \n\t" /* Forward the store address to next column. */
|
||||
|
||||
/*
 * Same operation as GEMM_ACOLCMPLX_CONTIGUOUS_STORE_FWD, named for use on
 * columns of C: store to CAddr, then advance CAddr by CCS.
 */
#define GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,CAddr,CCS) \
  GEMM_ACOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,CAddr,CCS)
|
||||
|
||||
48
kernels/armsve/3/armsve_asm_macros_dcomplex.h
Normal file
48
kernels/armsve/3/armsve_asm_macros_dcomplex.h
Normal file
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
// Specify to use double precision.
|
||||
/*
 * String fragments that the assembly macros paste into instruction text
 * to select 64-bit (double) elements.
 */
#define DT   "d"      /* Element-size suffix for Z registers (z0.d).        */
#define LD1  "ld1d"   /* Contiguous load of doublewords.                    */
#define ST1  "st1d"   /* Contiguous store of doublewords.                   */
#define LD2  "ld2d"   /* Two-register structure load (de-interleaving).     */
#define ST2  "st2d"   /* Two-register structure store (interleaving).       */
#define LD1R "ld1rd"  /* Load one doubleword and broadcast it to all lanes. */
#define PRFG "prfd"   /* Prefetch mnemonic for doubleword-sized elements.   */
#define SZ   "8"      /* Size of one real element in bytes.                 */
#define OFFS "lsl #3" /* Index scaling: shift left by log2(SZ).             */
|
||||
// Include macros.
|
||||
#include "armsve_asm_macros_cmplx.h"
|
||||
|
||||
281
kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c
Normal file
281
kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c
Normal file
@@ -0,0 +1,281 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Forschungszentrum Juelich
|
||||
Copyright (C) 2020, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
// Double-precision composite instructions.
|
||||
#include "armsve_asm_macros_dcomplex.h"
|
||||
|
||||
// 2vx8 microkernels.
|
||||
#include "armsve_asm_2vx8cmplx.h"
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
void bli_zgemm_armsve_asm_2vx8_unindexed
|
||||
(
|
||||
dim_t k0,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
// TODO: Write.
|
||||
assert( rs_c0 == 1 );
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_mker = k0 / 6;
|
||||
uint64_t k_left = k0 % 6;
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
uint64_t info = 0;
|
||||
|
||||
__asm__ volatile (
|
||||
// " ldr x0, %[a] \n\t"
|
||||
// " ldr x1, %[b] \n\t"
|
||||
" mov x2, xzr \n\t"
|
||||
" incd x2, ALL, MUL #1 \n\t" // Column-skip of A.
|
||||
" mov x3, #8 \n\t" // Row-skip of B.
|
||||
" \n\t"
|
||||
// " ldr x2, %[c] \n\t"
|
||||
// " ldr x3, %[rs_c] \n\t" // Row-skip of C.
|
||||
// " ldr x4, %[cs_c] \n\t" // Column-skip of C.
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x1 \n\t" // Tag A address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr %0, %0, x16 \n\t"
|
||||
" mov x16, 0x2 \n\t" // Tag B address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr %1, %1, x16 \n\t"
|
||||
" mov x16, 0x3 \n\t" // Tag C address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr %2, %2, x16 \n\t"
|
||||
#endif
|
||||
" \n\t"
|
||||
" mov x16, #16 \n\t" // Multiply some address skips by sizeof(dcomplex).
|
||||
" madd x2, x16, x2, xzr \n\t" // cs_a
|
||||
" madd x3, x16, x3, xzr \n\t" // rs_b
|
||||
" madd %4, x16, %4, xzr \n\t" // cs_c
|
||||
" ptrue p0.d \n\t"
|
||||
" \n\t"
|
||||
// " ldr x5, %[k_mker] \n\t" // Number of loops.
|
||||
// " ldr x6, %[k_left] \n\t"
|
||||
" \n\t"
|
||||
" LOAD_ABC: \n\t"
|
||||
" cmp %5, #0 \n\t" // Don't preload if no microkernel there.
|
||||
" b.eq END_CCOL_PRFM \n\t"
|
||||
" \n\t"
|
||||
" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real & half of imaginary.
|
||||
" ld1rd z21.d, p0/z, [%1, 8*2] \n\t"
|
||||
" ld1rd z22.d, p0/z, [%1, 8*4] \n\t"
|
||||
" ld1rd z23.d, p0/z, [%1, 8*6] \n\t"
|
||||
" ld1rd z24.d, p0/z, [%1, 8*8] \n\t"
|
||||
" ld1rd z25.d, p0/z, [%1, 8*10] \n\t"
|
||||
" ld1rd z26.d, p0/z, [%1, 8*12] \n\t"
|
||||
" ld1rd z27.d, p0/z, [%1, 8*14] \n\t"
|
||||
" ld1rd z28.d, p0/z, [%1, 8*1] \n\t"
|
||||
" ld1rd z29.d, p0/z, [%1, 8*3] \n\t"
|
||||
" ld1rd z30.d, p0/z, [%1, 8*5] \n\t"
|
||||
" ld1rd z31.d, p0/z, [%1, 8*7] \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2)
|
||||
" \n\t"
|
||||
" CCOL_PRFM: \n\t"
|
||||
" cmp %3, #1 \n\t"
|
||||
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
|
||||
" mov x16, %2 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, %5 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, %5 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, %5 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, %5 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, %5 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, %5 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" add x16, x16, %5 \n\t"
|
||||
" prfm PLDL1KEEP, [x16] \n\t"
|
||||
" END_CCOL_PRFM: \n\t"
|
||||
" \n\t"
|
||||
CLEAR_COL16(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15)
|
||||
" \n\t"
|
||||
" cmp %5, #0 \n\t" // If no 4-microkernel can be applied
|
||||
" b.eq K_LEFT_LOOP \n\t"
|
||||
" \n\t"
|
||||
" K_MKER_LOOP: \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2)
|
||||
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
|
||||
" \n\t"
|
||||
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2)
|
||||
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
|
||||
" \n\t"
|
||||
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2)
|
||||
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
|
||||
" \n\t"
|
||||
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2)
|
||||
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
|
||||
" \n\t"
|
||||
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2)
|
||||
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
|
||||
" \n\t"
|
||||
" subs %5, %5, #1 \n\t" // Decrease counter before final replica.
|
||||
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
|
||||
" \n\t"
|
||||
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2)
|
||||
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
|
||||
" b K_MKER_LOOP \n\t"
|
||||
" \n\t"
|
||||
" FIN_MKER_LOOP: \n\t"
|
||||
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
|
||||
" \n\t"
|
||||
" K_LEFT_LOOP: \n\t"
|
||||
" cmp %6, #0 \n\t" // End of execution.
|
||||
" b.eq WRITE_MEM_PREP \n\t"
|
||||
" \n\t"
|
||||
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2)
|
||||
" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Reload B's real & half of imaginary.
|
||||
" ld1rd z21.d, p0/z, [%1, 8*2] \n\t"
|
||||
" ld1rd z22.d, p0/z, [%1, 8*4] \n\t"
|
||||
" ld1rd z23.d, p0/z, [%1, 8*6] \n\t"
|
||||
" ld1rd z24.d, p0/z, [%1, 8*8] \n\t"
|
||||
" ld1rd z25.d, p0/z, [%1, 8*10] \n\t"
|
||||
" ld1rd z26.d, p0/z, [%1, 8*12] \n\t"
|
||||
" ld1rd z27.d, p0/z, [%1, 8*14] \n\t"
|
||||
" ld1rd z28.d, p0/z, [%1, 8*1] \n\t"
|
||||
" ld1rd z29.d, p0/z, [%1, 8*3] \n\t"
|
||||
" ld1rd z30.d, p0/z, [%1, 8*5] \n\t"
|
||||
" ld1rd z31.d, p0/z, [%1, 8*7] \n\t"
|
||||
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
|
||||
" sub %6, %6, #1 \n\t"
|
||||
" b K_LEFT_LOOP \n\t" // Next column / row.
|
||||
" \n\t"
|
||||
" WRITE_MEM_PREP: \n\t"
|
||||
" \n\t"
|
||||
// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address).
|
||||
// " ldr x8, %[beta] \n\t"
|
||||
" ld1rd z16.d, p0/z, [%7] \n\t" // Real(alpha).
|
||||
" ld1rd z17.d, p0/z, [%7, 8] \n\t" // Imag(alpha).
|
||||
" ld1rd z18.d, p0/z, [%8] \n\t" // Real(beta).
|
||||
" ld1rd z19.d, p0/z, [%8, 8] \n\t" // Imag(beta).
|
||||
" \n\t"
|
||||
" PREFETCH_ABNEXT: \n\t"
|
||||
// " ldr x9, %[a_next] \n\t"
|
||||
// " ldr x10, %[b_next] \n\t"
|
||||
#ifdef _A64FX
|
||||
" mov x16, 0x1 \n\t" // Tag A address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr %9, %9, x16 \n\t"
|
||||
" mov x16, 0x2 \n\t" // Tag B address.
|
||||
" lsl x16, x16, #56 \n\t"
|
||||
" orr %10, %10, x16 \n\t"
|
||||
#endif
|
||||
" prfm PLDL1STRM, [%9] \n\t"
|
||||
" prfm PLDL1STRM, [%9, 256*1] \n\t"
|
||||
" prfm PLDL1STRM, [%10] \n\t"
|
||||
" prfm PLDL1STRM, [%10, 256*1] \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM: \n\t"
|
||||
" \n\t"
|
||||
GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z16,z17)
|
||||
GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z4 ,z5 ,z6 ,z7 ,z16,z17)
|
||||
GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z8 ,z9 ,z10,z11,z16,z17)
|
||||
GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z12,z13,z14,z15,z16,z17)
|
||||
" \n\t"
|
||||
" UNIT_ALPHA: \n\t"
|
||||
" mov x9, %2 \n\t" // C address for loading.
|
||||
" \n\t" // C address for storing is %2 itself.
|
||||
" cmp %3, #1 \n\t"
|
||||
" b.ne WRITE_MEM_G \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t"
|
||||
GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4)
|
||||
GEMM_CCMPLX_LOAD_COL2_C(z24,z25,z26,z27,p0,x9,%4)
|
||||
GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z18,z19)
|
||||
GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z24,z25,z26,z27,z18,z19)
|
||||
GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4)
|
||||
GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4)
|
||||
" \n\t"
|
||||
GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4)
|
||||
GEMM_CCMPLX_LOAD_COL2_C(z24,z25,z26,z27,p0,x9,%4)
|
||||
GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z18,z19)
|
||||
GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z24,z25,z26,z27,z18,z19)
|
||||
GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4)
|
||||
GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
// TODO: Implement.
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" b END_EXEC \n\t"
|
||||
" \n\t"
|
||||
" END_EXEC: \n\t"
|
||||
" mov %11, #0 \n\t" // Return normal.
|
||||
: "+r" (a), // %0
|
||||
"+r" (b), // %1
|
||||
"+r" (c), // %2
|
||||
"+r" (rs_c), // %3
|
||||
"+r" (cs_c), // %4
|
||||
"+r" (k_mker), // %5
|
||||
"+r" (k_left), // %6
|
||||
"+r" (alpha), // %7
|
||||
"+r" (beta), // %8
|
||||
"+r" (a_next), // %9
|
||||
"+r" (b_next), // %10
|
||||
"=r" (info) // %11
|
||||
:
|
||||
: "x2","x3","x9","x16",
|
||||
"z0","z1","z2","z3","z4","z5","z6","z7",
|
||||
"z8","z9","z10","z11","z12","z13","z14","z15",
|
||||
"z16","z17","z18","z19",
|
||||
"z20","z21","z22","z23",
|
||||
"z24","z25","z26","z27",
|
||||
"z28","z29","z30","z31"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 )
|
||||
GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed )
|
||||
GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed )
|
||||
GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed )
|
||||
//GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed )
|
||||
//GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed )
|
||||
//GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed )
|
||||
|
||||
Reference in New Issue
Block a user