diff --git a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c b/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c new file mode 100644 index 000000000..c87ff1feb --- /dev/null +++ b/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c @@ -0,0 +1,450 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// Separate instantiation for Armv8-A reference kernels. +// Temporary workaround. Will be removed after upstream has switched to a better way +// of exposing gemmsup interface. + +// +// -- Row storage case --------------------------------------------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, arch, suf ) \ +\ +void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + /* NOTE: This microkernel can actually handle arbitrarily large + values of m, n, and k. */ \ +\ + if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ + { \ + /* Traverse c by rows. */ \ + for ( dim_t i = 0; i < m; ++i ) \ + { \ + ctype* restrict ci = &c[ i*rs_c ]; \ + ctype* restrict ai = &a[ i*rs_a ]; \ +\ + for ( dim_t j = 0; j < n; ++j ) \ + { \ + ctype* restrict cij = &ci[ j*cs_c ]; \ + ctype* restrict bj = &b [ j*cs_b ]; \ + ctype ab; \ +\ + PASTEMAC(ch,set0s)( ab ); \ +\ + /* Perform a dot product to update the (i,j) element of c. */ \ + for ( dim_t l = 0; l < k; ++l ) \ + { \ + ctype* restrict aij = &ai[ l*cs_a ]; \ + ctype* restrict bij = &bj[ l*rs_b ]; \ +\ + PASTEMAC(ch,dots)( *aij, *bij, ab ); \ + } \ +\ + /* If beta is one, add ab into c. If beta is zero, overwrite c + with the result in ab. Otherwise, scale by beta and accumulate + ab to c. */ \ + if ( PASTEMAC(ch,eq1)( *beta ) ) \ + { \ + PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ + } \ + else if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ + } \ + else \ + { \ + PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ + } \ + } \ + } \ + } \ + else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ + { \ + /* Traverse c by rows. */ \ + for ( dim_t i = 0; i < m; ++i ) \ + { \ + ctype* restrict ci = &c[ i*rs_c ]; \ + ctype* restrict ai = &a[ i*rs_a ]; \ +\ + for ( dim_t j = 0; j < n; ++j ) \ + { \ + ctype* restrict cij = &ci[ j*cs_c ]; \ + ctype* restrict bj = &b [ j*cs_b ]; \ + ctype ab; \ +\ + PASTEMAC(ch,set0s)( ab ); \ +\ + /* Perform a dot product to update the (i,j) element of c. */ \ + for ( dim_t l = 0; l < k; ++l ) \ + { \ + ctype* restrict aij = &ai[ l*cs_a ]; \ + ctype* restrict bij = &bj[ l*rs_b ]; \ +\ + PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ + } \ +\ + /* If beta is one, add ab into c. If beta is zero, overwrite c + with the result in ab. Otherwise, scale by beta and accumulate + ab to c. */ \ + if ( PASTEMAC(ch,eq1)( *beta ) ) \ + { \ + PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ + } \ + else if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ + } \ + else \ + { \ + PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ + } \ + } \ + } \ + } \ + else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ + { \ + /* Traverse c by rows. */ \ + for ( dim_t i = 0; i < m; ++i ) \ + { \ + ctype* restrict ci = &c[ i*rs_c ]; \ + ctype* restrict ai = &a[ i*rs_a ]; \ +\ + for ( dim_t j = 0; j < n; ++j ) \ + { \ + ctype* restrict cij = &ci[ j*cs_c ]; \ + ctype* restrict bj = &b [ j*cs_b ]; \ + ctype ab; \ +\ + PASTEMAC(ch,set0s)( ab ); \ +\ + /* Perform a dot product to update the (i,j) element of c. */ \ + for ( dim_t l = 0; l < k; ++l ) \ + { \ + ctype* restrict aij = &ai[ l*cs_a ]; \ + ctype* restrict bij = &bj[ l*rs_b ]; \ +\ + PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ + } \ +\ + /* If beta is one, add ab into c. If beta is zero, overwrite c + with the result in ab. Otherwise, scale by beta and accumulate + ab to c. */ \ + if ( PASTEMAC(ch,eq1)( *beta ) ) \ + { \ + PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ + } \ + else if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ + } \ + else \ + { \ + PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ + { \ + /* Traverse c by rows. */ \ + for ( dim_t i = 0; i < m; ++i ) \ + { \ + ctype* restrict ci = &c[ i*rs_c ]; \ + ctype* restrict ai = &a[ i*rs_a ]; \ +\ + for ( dim_t j = 0; j < n; ++j ) \ + { \ + ctype* restrict cij = &ci[ j*cs_c ]; \ + ctype* restrict bj = &b [ j*cs_b ]; \ + ctype ab; \ +\ + PASTEMAC(ch,set0s)( ab ); \ +\ + /* Perform a dot product to update the (i,j) element of c. */ \ + for ( dim_t l = 0; l < k; ++l ) \ + { \ + ctype* restrict aij = &ai[ l*cs_a ]; \ + ctype* restrict bij = &bj[ l*rs_b ]; \ +\ + PASTEMAC(ch,dots)( *aij, *bij, ab ); \ + } \ +\ + /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ + PASTEMAC(ch,conjs)( ab ); \ +\ + /* If beta is one, add ab into c. If beta is zero, overwrite c + with the result in ab. Otherwise, scale by beta and accumulate + ab to c. */ \ + if ( PASTEMAC(ch,eq1)( *beta ) ) \ + { \ + PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ + } \ + else if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ + } \ + else \ + { \ + PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC2( gemmsup_r, _armv8a, _ref2 ) + +// +// -- Column storage case ------------------------------------------------------ +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, arch, suf ) \ +\ +void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + /* NOTE: This microkernel can actually handle arbitrarily large + values of m, n, and k. */ \ +\ + if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ + { \ + /* Traverse c by columns. */ \ + for ( dim_t j = 0; j < n; ++j ) \ + { \ + ctype* restrict cj = &c[ j*cs_c ]; \ + ctype* restrict bj = &b[ j*cs_b ]; \ +\ + for ( dim_t i = 0; i < m; ++i ) \ + { \ + ctype* restrict cij = &cj[ i*rs_c ]; \ + ctype* restrict ai = &a [ i*rs_a ]; \ + ctype ab; \ +\ + PASTEMAC(ch,set0s)( ab ); \ +\ + /* Perform a dot product to update the (i,j) element of c. */ \ + for ( dim_t l = 0; l < k; ++l ) \ + { \ + ctype* restrict aij = &ai[ l*cs_a ]; \ + ctype* restrict bij = &bj[ l*rs_b ]; \ +\ + PASTEMAC(ch,dots)( *aij, *bij, ab ); \ + } \ +\ + /* If beta is one, add ab into c. If beta is zero, overwrite c + with the result in ab. Otherwise, scale by beta and accumulate + ab to c. */ \ + if ( PASTEMAC(ch,eq1)( *beta ) ) \ + { \ + PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ + } \ + else if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ + } \ + else \ + { \ + PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ + } \ + } \ + } \ + } \ + else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ + { \ + /* Traverse c by columns. */ \ + for ( dim_t j = 0; j < n; ++j ) \ + { \ + ctype* restrict cj = &c[ j*cs_c ]; \ + ctype* restrict bj = &b[ j*cs_b ]; \ +\ + for ( dim_t i = 0; i < m; ++i ) \ + { \ + ctype* restrict cij = &cj[ i*rs_c ]; \ + ctype* restrict ai = &a [ i*rs_a ]; \ + ctype ab; \ +\ + PASTEMAC(ch,set0s)( ab ); \ +\ + /* Perform a dot product to update the (i,j) element of c. */ \ + for ( dim_t l = 0; l < k; ++l ) \ + { \ + ctype* restrict aij = &ai[ l*cs_a ]; \ + ctype* restrict bij = &bj[ l*rs_b ]; \ +\ + PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ + } \ +\ + /* If beta is one, add ab into c. If beta is zero, overwrite c + with the result in ab. Otherwise, scale by beta and accumulate + ab to c. */ \ + if ( PASTEMAC(ch,eq1)( *beta ) ) \ + { \ + PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ + } \ + else if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ + } \ + else \ + { \ + PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ + } \ + } \ + } \ + } \ + else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ + { \ + /* Traverse c by columns. */ \ + for ( dim_t j = 0; j < n; ++j ) \ + { \ + ctype* restrict cj = &c[ j*cs_c ]; \ + ctype* restrict bj = &b[ j*cs_b ]; \ +\ + for ( dim_t i = 0; i < m; ++i ) \ + { \ + ctype* restrict cij = &cj[ i*rs_c ]; \ + ctype* restrict ai = &a [ i*rs_a ]; \ + ctype ab; \ +\ + PASTEMAC(ch,set0s)( ab ); \ +\ + /* Perform a dot product to update the (i,j) element of c. */ \ + for ( dim_t l = 0; l < k; ++l ) \ + { \ + ctype* restrict aij = &ai[ l*cs_a ]; \ + ctype* restrict bij = &bj[ l*rs_b ]; \ +\ + PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ + } \ +\ + /* If beta is one, add ab into c. If beta is zero, overwrite c + with the result in ab. Otherwise, scale by beta and accumulate + ab to c. */ \ + if ( PASTEMAC(ch,eq1)( *beta ) ) \ + { \ + PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ + } \ + else if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ + } \ + else \ + { \ + PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ + { \ + /* Traverse c by columns. */ \ + for ( dim_t j = 0; j < n; ++j ) \ + { \ + ctype* restrict cj = &c[ j*cs_c ]; \ + ctype* restrict bj = &b[ j*cs_b ]; \ +\ + for ( dim_t i = 0; i < m; ++i ) \ + { \ + ctype* restrict cij = &cj[ i*rs_c ]; \ + ctype* restrict ai = &a [ i*rs_a ]; \ + ctype ab; \ +\ + PASTEMAC(ch,set0s)( ab ); \ +\ + /* Perform a dot product to update the (i,j) element of c. */ \ + for ( dim_t l = 0; l < k; ++l ) \ + { \ + ctype* restrict aij = &ai[ l*cs_a ]; \ + ctype* restrict bij = &bj[ l*rs_b ]; \ +\ + PASTEMAC(ch,dots)( *aij, *bij, ab ); \ + } \ +\ + /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ + PASTEMAC(ch,conjs)( ab ); \ +\ + /* If beta is one, add ab into c. If beta is zero, overwrite c + with the result in ab. Otherwise, scale by beta and accumulate + ab to c. */ \ + if ( PASTEMAC(ch,eq1)( *beta ) ) \ + { \ + PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ + } \ + else if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ + } \ + else \ + { \ + PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC2( gemmsup_c, _armv8a, _ref2 ) + diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c new file mode 100644 index 000000000..e3edbbd20 --- /dev/null +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c @@ -0,0 +1,412 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "assert.h" + +GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) + +// Label locality & misc. +#include "../armv8a_asm_utils.h" + +// Nanokernel operations. +#include "../armv8a_asm_d2x2.h" + +/* + * +---+ +---+ + * | 0 | | 4 | + * +---+ +---+ + * +---+ +---+ + * | 1 | | 5 | + * +---+ +---+ + * +---+ +---+ + * | 2 | | 6 | + * +---+ +---+ + * +---+ +---+ + * | 3 | | 7 | + * +---+ +---+ + * + */ +#define DGEMM_8X4_MKER_LOOP_PLAIN(C00,C10,C20,C30,C01,C11,C21,C31,C02,C12,C22,C32,C03,C13,C23,C33,A0,A1,A2,A3,B0,B1,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) \ + DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) \ + DGEMM_2X2_NANOKERNEL(C20,C21,A2,B0) \ + DGEMM_2X2_NANOKERNEL(C30,C31,A3,B0) \ + DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ + DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) \ + DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1) \ + DGEMM_2X2_NANOKERNEL(C22,C23,A2,B1) \ + DGEMM_2X2_NANOKERNEL(C32,C33,A3,B1) + +// Interleaving load or not. +#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) +#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ +" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t" + +#define DLOADC_4V_C_FWD(C0,C1,C2,C3,CADDR,CSHIFT,LDC) \ + DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#LDC" \n\t" +#define DSTOREC_4V_C_FWD(C0,C1,C2,C3,CADDR,CSHIFT,LDC) \ + DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#LDC" \n\t" + +#define DLOADC_4V_R_FWD(C00,C01,C10,C11,CADDR,CSHIFT,RSC) \ + DLOAD2V(C00,C01,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" \ + DLOAD2V(C10,C11,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" +#define DSTOREC_4V_R_FWD(C00,C01,C10,C11,CADDR,CSHIFT,RSC) \ + DSTORE2V(C00,C01,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" \ + DSTORE2V(C10,C11,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +void bli_dgemmsup_rv_armv8a_asm_8x4m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + if ( n0 != 4 ) + { + // TODO: Implement smaller kernels? + + bli_dgemmsup_r_armv8a_ref2 + ( + conja, conjb, m0, n0, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + return; + } + + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + uint64_t ps_a = bli_auxinfo_ps_a( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 6; + uint64_t k_left = k0 % 6; + + uint64_t m_iter = m0 / 8; + uint64_t m_left = m0 % 8; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + // uint64_t cs_b = cs_b0; + assert( cs_b0 == 1 ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + __asm__ volatile + ( +" ldr x10, %[a] \n\t" +" ldr x13, %[c] \n\t" +" ldr x12, %[m_iter] \n\t" +" ldr x11, %[ps_a] \n\t" // Panel-skip of A. +" ldr x2, %[cs_a] \n\t" // Column-skip of A. +" ldr x9, %[rs_a] \n\t" // Row-skip of A. +" ldr x3, %[rs_b] \n\t" // Row-skip of B. +" \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. +" ldr x7, %[cs_c] \n\t" // Column-skip of C. +" \n\t" +" \n\t" // Multiply some address skips by sizeof(double). +" lsl x11, x11, #3 \n\t" // ps_a +" lsl x9, x9, #3 \n\t" // rs_a +" lsl x2, x2, #3 \n\t" // cs_a +" lsl x3, x3, #3 \n\t" // rs_b +" lsl x6, x6, #3 \n\t" // rs_c +" lsl x7, x7, #3 \n\t" // cs_c +" \n\t" +LABEL(MILLIKER_MLOOP) +" \n\t" +" mov x0, x10 \n\t" // Parameters to be reloaded +" mov x5, x13 \n\t" // within each millikernel loop. +" ldr x1, %[b] \n\t" +" ldr x4, %[k_mker] \n\t" +" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:15] <- C +// V[16:19] <- B; Allowed latency: 24 cycles / # of FPUs. +// V[20:31] <- A; Allowed latency: 32 cycles / # of FPUs. +// Under this scheme, the following is defined: +#define DGEMM_8X4_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_8X4_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,A0,A1,A2,A3,B0,B1,BADDR,BSHIFT,LOADNEXT) +LABEL(LOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(CLEAR_CCOLS) +" \n\t" +" mov x14, x0 \n\t" +" ld1 {v20.d}[0], [x14], x9 \n\t" +" ld1 {v20.d}[1], [x14], x9 \n\t" +" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" ld1 {v22.d}[1], [x14], x9 \n\t" +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v24.d}[0], [x14], x9 \n\t" +" ld1 {v24.d}[1], [x14], x9 \n\t" +" ld1 {v25.d}[0], [x14], x9 \n\t" +" ld1 {v25.d}[1], [x14], x9 \n\t" +" ld1 {v26.d}[0], [x14], x9 \n\t" +" ld1 {v26.d}[1], [x14], x9 \n\t" +" ld1 {v27.d}[0], [x14], x9 \n\t" +" ld1 {v27.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v28.d}[0], [x14], x9 \n\t" +" ld1 {v28.d}[1], [x14], x9 \n\t" +" ld1 {v29.d}[0], [x14], x9 \n\t" +" ld1 {v29.d}[1], [x14], x9 \n\t" +" ld1 {v30.d}[0], [x14], x9 \n\t" +" ld1 {v30.d}[1], [x14], x9 \n\t" +" ld1 {v31.d}[0], [x14], x9 \n\t" +" ld1 {v31.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" \n\t" +" ldr q16, [x1, #16*0] \n\t" +" ldr q17, [x1, #16*1] \n\t" +" add x1, x1, x3 \n\t" +" ldr q18, [x1, #16*0] \n\t" +" ldr q19, [x1, #16*1] \n\t" +" add x1, x1, x3 \n\t" +LABEL(CLEAR_CCOLS) +CLEAR8V(0,1,2,3,4,5,6,7) +CLEAR8V(8,9,10,11,12,13,14,15) +// No-microkernel early return, once again. +BEQ(K_LEFT_LOOP) +// +// Microkernel is defined here as: +#define DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,B0,B1) \ + DGEMM_8X4_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,x1,0,load) \ + "mov x14, x0 \n\t" \ + "ld1 {v"#A0".d}[0], [x14], x9 \n\t" \ + "ld1 {v"#A0".d}[1], [x14], x9 \n\t" \ + "ld1 {v"#A1".d}[0], [x14], x9 \n\t" \ + "ld1 {v"#A1".d}[1], [x14], x9 \n\t" \ + "ld1 {v"#A2".d}[0], [x14], x9 \n\t" \ + "ld1 {v"#A2".d}[1], [x14], x9 \n\t" \ + "ld1 {v"#A3".d}[0], [x14], x9 \n\t" \ + "ld1 {v"#A3".d}[1], [x14], x9 \n\t" \ + "ldr q"#B1", [x1, #16*1] \n\t" \ + "add x1, x1, x3 \n\t" \ + "add x0, x0, x2 \n\t" +// Start microkernel loop. +LABEL(K_MKER_LOOP) +DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,23,16,17) +DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,18,19) +DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(28,29,30,31,16,17) +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) +DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,23,18,19) +DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,16,17) +DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(28,29,30,31,18,19) +BRANCH(K_MKER_LOOP) +// +// Final microkernel loop. +LABEL(FIN_MKER_LOOP) +DGEMM_8X4_MKER_LOOP_PLAIN_LOC(20,21,22,23,18,19,x1,0,load) +" ldr q19, [x1, #16*1] \n\t" +" add x1, x1, x3 \n\t" +DGEMM_8X4_MKER_LOOP_PLAIN_LOC(24,25,26,27,16,17,xzr,-1,noload) +DGEMM_8X4_MKER_LOOP_PLAIN_LOC(28,29,30,31,18,19,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(K_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(WRITE_MEM_PREP) +" mov x14, x0 \n\t" +" ld1 {v20.d}[0], [x14], x9 \n\t" // Load A col. +" ld1 {v20.d}[1], [x14], x9 \n\t" +" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" ld1 {v22.d}[1], [x14], x9 \n\t" +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" ldr q16, [x1, #16*0] \n\t" // Load B col. +" ldr q17, [x1, #16*1] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +DGEMM_8X4_MKER_LOOP_PLAIN_LOC(20,21,22,23,16,17,xzr,-1,noload) +BRANCH(K_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(WRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v16.2d}, [x4] \n\t" // Load alpha & beta (value). +" ld1r {v17.2d}, [x8] \n\t" +" \n\t" +" mov x1, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +" cmp x6, #8 \n\t" // Check for row-storage. +BNE(WRITE_MEM_R) +// +// C storage in columns. +LABEL(WRITE_MEM_C) +DLOADC_4V_C_FWD(20,21,22,23,x1,0,x7) +DLOADC_4V_C_FWD(24,25,26,27,x1,0,x7) +DSCALE8V(20,21,22,23,24,25,26,27,17,0) +DSCALEA8V(20,21,22,23,24,25,26,27,0,1,2,3,4,5,6,7,16,0) +// +DLOADC_4V_C_FWD(0,1,2,3,x1,0,x7) +DLOADC_4V_C_FWD(4,5,6,7,x1,0,x7) +DSCALE8V(0,1,2,3,4,5,6,7,17,0) +DSCALEA8V(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,0) +// +DSTOREC_4V_C_FWD(20,21,22,23,x5,0,x7) +DSTOREC_4V_C_FWD(24,25,26,27,x5,0,x7) +DSTOREC_4V_C_FWD(0,1,2,3,x5,0,x7) +DSTOREC_4V_C_FWD(4,5,6,7,x5,0,x7) +BRANCH(END_WRITE_MEM) +// +// C storage in rows. +LABEL(WRITE_MEM_R) +// In-register transpose. +" trn1 v16.2d, v0.2d, v4.2d \n\t" // Row 0. +" trn1 v17.2d, v8.2d, v12.2d \n\t" +" trn2 v18.2d, v0.2d, v4.2d \n\t" // Row 1. +" trn2 v19.2d, v8.2d, v12.2d \n\t" +" trn1 v20.2d, v1.2d, v5.2d \n\t" // Row 2. +" trn1 v21.2d, v9.2d, v13.2d \n\t" +" trn2 v22.2d, v1.2d, v5.2d \n\t" // Row 3. +" trn2 v23.2d, v9.2d, v13.2d \n\t" +" trn1 v24.2d, v2.2d, v6.2d \n\t" // Row 4. +" trn1 v25.2d, v10.2d, v14.2d \n\t" +" trn2 v26.2d, v2.2d, v6.2d \n\t" // Row 5. +" trn2 v27.2d, v10.2d, v14.2d \n\t" +" trn1 v28.2d, v3.2d, v7.2d \n\t" // Row 6. +" trn1 v29.2d, v11.2d, v15.2d \n\t" +" trn2 v30.2d, v3.2d, v7.2d \n\t" // Row 7. +" trn2 v31.2d, v11.2d, v15.2d \n\t" +" ld1r {v14.2d}, [x4] \n\t" // Reload alpha & beta (value). +" ld1r {v15.2d}, [x8] \n\t" +DLOADC_4V_R_FWD(0,1,2,3,x1,0,x6) +DLOADC_4V_R_FWD(4,5,6,7,x1,0,x6) +DSCALE8V(0,1,2,3,4,5,6,7,15,0) +DSCALEA8V(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23,14,0) +// +DLOADC_4V_R_FWD(16,17,18,19,x1,0,x6) +DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6) +DSCALE8V(16,17,18,19,20,21,22,23,15,0) +DSCALEA8V(16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,14,0) +// +DSTOREC_4V_R_FWD(0,1,2,3,x5,0,x6) +DSTOREC_4V_R_FWD(4,5,6,7,x5,0,x6) +DSTOREC_4V_R_FWD(16,17,18,19,x5,0,x6) +DSTOREC_4V_R_FWD(20,21,22,23,x5,0,x6) +// +// End of this microkernel. +LABEL(END_WRITE_MEM) +" \n\t" +" subs x12, x12, #1 \n\t" +BEQ(END_EXEC) +" \n\t" +" mov x8, #8 \n\t" +" madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel. +" add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel. +BRANCH(MILLIKER_MLOOP) +// +// End of execution. +LABEL(END_EXEC) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a] "m" (ps_a), + [rs_b] "m" (rs_b), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [m_iter] "m" (m_iter), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta) +: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10","x11","x12","x13","x14", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19","v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31" + ); + +consider_edge_cases: + // TODO: Implement. + // + + // Issue prefetch instructions only after + // execution is done. + __asm__ + ( +" mov x0, %[a_next] \n\t" +" mov x1, %[b_next] \n\t" +" prfm PLDL1STRM, [x0, #16*0] \n\t" +" prfm PLDL1STRM, [x0, #16*1] \n\t" +" prfm PLDL1STRM, [x0, #16*2] \n\t" +" prfm PLDL1KEEP, [x1, #16*0] \n\t" +" prfm PLDL1KEEP, [x1, #16*1] \n\t" +" prfm PLDL1KEEP, [x1, #16*2] \n\t" +: +: [a_next] "r" (a_next), + [b_next] "r" (b_next) +: "x0", "x1" + ); +} +