diff --git a/CMakeLists.txt b/CMakeLists.txt
index 28b8c1d26..106fd21f7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -354,6 +354,15 @@ if(${TARGET_ARCH} STREQUAL zen4 OR
     set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
     set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
     set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
 endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W0 ")
diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c
index 40d630425..22918c1f7 100644
--- a/config/zen4/bli_cntx_init_zen4.c
+++ b/config/zen4/bli_cntx_init_zen4.c
@@ -240,7 +240,9 @@ void bli_cntx_init_zen4( cntx_t* cntx )
    bli_cntx_set_l3_sup_kers
    (
      30,
-     //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
+     // 6x8 kernels will still be used for gemmt/syrk SUP.
+     // For gemm, a special function is used to override these
+     // blocksizes and functions with 24x8-specific ones.
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
@@ -334,6 +336,50 @@ void bli_zen4_override_trsm_blkszs (cntx_t* cntx)
    );
 }
 
+
+// Since gemmt/syrk SUP requires the block sizes to be 6x8, we use this
+// function to override the blocksizes and kernel functions with AVX-512
+// ones for DGEMM only.
+// This function should be removed once checks are added around the
+// 6x8-specific gemmt code.
+void bli_zen4_override_gemm_blkszs (cntx_t* cntx)
+{
+    blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+
+    bli_blksz_init     ( &blkszs[ BLIS_MR ],     6,    24,     3,     3,
+                                                 9,     9,     3,     3 );
+    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
+    bli_blksz_init_easy( &blkszs[ BLIS_KC ],   512,   480,   128,    64 );
+    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,   144,    72,    36 );
+
+    // Update the context with the current architecture's register and cache
+    // blocksizes (and multiples) for native execution.
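+    // Only four of the sup blocksizes (KC, MC, NR and MR) are overridden
+    // below; NC keeps the value set up by bli_cntx_init_zen4(). In
+    // bli_blksz_init, the first four values above are the (s,d,c,z)
+    // blocksizes and the last four are their maxima; only the BLIS_DOUBLE
+    // entries are exercised, since this override is applied for dgemm only.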
+    bli_cntx_set_l3_sup_blkszs
+    (
+      4,
+      // level-3
+      BLIS_KC, &blkszs[ BLIS_KC ],
+      BLIS_MC, &blkszs[ BLIS_MC ],
+      BLIS_NR, &blkszs[ BLIS_NR ],
+      BLIS_MR, &blkszs[ BLIS_MR ],
+      cntx
+    );
+
+    bli_cntx_set_l3_sup_kers
+    (
+      8,
+      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      cntx
+    );
+}
+
 /*
  * Restore the block sizes to default values needed for zen4 context.
  *
diff --git a/config/zen4/bli_family_zen4.h b/config/zen4/bli_family_zen4.h
index b21d1582f..b989a4585 100644
--- a/config/zen4/bli_family_zen4.h
+++ b/config/zen4/bli_family_zen4.h
@@ -73,6 +73,8 @@
  */
 BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx);
 
+BLIS_EXPORT_BLIS void bli_zen4_override_gemm_blkszs (cntx_t* cntx);
+
 /*
  * Restore the block sizes to default values needed for zen4 context.
  *
diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c
index 3ade739c4..81c0ed9c1 100644
--- a/frame/3/bli_l3_sup.c
+++ b/frame/3/bli_l3_sup.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2019-22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -73,6 +73,7 @@ err_t bli_gemmsup
    trans_t transa = bli_obj_conjtrans_status( a );
    trans_t transb = bli_obj_conjtrans_status( b );
 
+   // Don't use sup for currently unsupported storage types in cgemmsup.
    if(bli_obj_is_scomplex(c) &&
    (((stor_id == BLIS_RRC)||(stor_id == BLIS_CRC))
@@ -95,17 +96,49 @@ err_t bli_gemmsup
    }
 
-   // Obtain a valid (native) context from the gks if necessary.
+   // Obtain a valid context from the gks if necessary.
    // NOTE: This must be done before calling the _check() function, since
    // that function assumes the context pointer is valid.
    if ( cntx == NULL ) cntx = bli_gks_query_cntx();
 
+   // Create a local copy of cntx in order to store the overridden
+   // blocksizes and kernel function pointers.
+   // This can be removed when we use the same blocksizes and function
+   // pointers for all level-3 SUP routines.
+   cntx_t cntx_gemm = *cntx;
+
+
    // Initialize a local runtime with global settings if necessary. Note
    // that in the case that a runtime is passed in, we make a local copy.
    rntm_t rntm_l;
    if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
    else                { rntm_l = *rntm;                       rntm = &rntm_l; }
 
+#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN)
+
+   if((bli_arch_query_id() == BLIS_ARCH_ZEN4) && (bli_obj_dt(a) == BLIS_DOUBLE))
+   {
+       // This check will be removed once transpose and store of the C
+       // matrix inside the kernel is supported.
+       if((stor_id == BLIS_RCC || stor_id == BLIS_CRR || stor_id == BLIS_RRC))
+       {
+           AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Unsupported storage type for dgemm.");
+           return BLIS_FAILURE;
+       }
+       // Override the existing blocksizes with 24x8-specific ones.
+       // This can be removed when we use the same blocksizes and function
+       // pointers for all level-3 SUP routines.
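+       // Note that only the local copy (cntx_gemm) is modified here; the
+       // global context still advertises the 6x8 kernels that the
+       // gemmt/syrk SUP paths rely on.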
+       bli_zen4_override_gemm_blkszs(&cntx_gemm);
+
+       // Pack A to avoid RD kernels.
+       if((stor_id == BLIS_CRC || stor_id == BLIS_RRC))
+       {
+           bli_rntm_set_pack_a(1, rntm); // packa
+       }
+   }
+
+#endif
+
 #ifdef AOCL_DYNAMIC
    // Calculating optimal nt and corresponding factorization (ic,jc) here, so
    // as to determine the matrix dimensions (A - m, B - n) per thread. This
@@ -158,7 +191,7 @@ err_t bli_gemmsup
      b,
      beta,
      c,
-     cntx,
+     &cntx_gemm,
      rntm
    );
 
diff --git a/frame/3/bli_l3_sup_int_amd.c b/frame/3/bli_l3_sup_int_amd.c
index 91367e00e..a8b4f1b3e 100644
--- a/frame/3/bli_l3_sup_int_amd.c
+++ b/frame/3/bli_l3_sup_int_amd.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2019-22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -66,6 +66,16 @@ err_t bli_gemmsup_int
                                      stor_id == BLIS_RRC ||
                                      stor_id == BLIS_RCR ||
                                      stor_id == BLIS_CRR );
+
+   const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
+   const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
+   const bool col_pref = !row_pref;
+
+   // For row-preferred kernels, rrr_rrc_rcr_crr becomes primary;
+   // for col-preferred kernels, rcc_crc_ccr_ccc becomes primary.
+   const bool is_primary = ( row_pref && is_rrr_rrc_rcr_crr ) ||
+                           ( col_pref && is_rcc_crc_ccr_ccc );
+
 #ifdef TRACEVAR
    if ( bli_thread_am_ochief( thread ) )
    printf( "bli_l3_sup_int(): var2m primary\n" );
@@ -78,12 +88,11 @@ err_t bli_gemmsup_int
        return BLIS_FAILURE;
    }
 
-   if ( is_rrr_rrc_rcr_crr )
+   if ( is_primary )
    {
        // This branch handles:
        //  - rrr rrc rcr crr for row-preferential kernels
        //  - rcc crc ccr ccc for column-preferential kernels
-       //  - Currently only row-preferential kernels are only supported.
 
        // calculate number of micropanels in m and n dimensions and
        // recalculate the automatic thread factorization based on these number of micropanels
@@ -164,7 +173,6 @@ err_t bli_gemmsup_int
        // This branch handles:
        //  - rrr rrc rcr crr for column-preferential kernels
        //  - rcc crc ccr ccc for row-preferential kernels
-       //  - Currently only row-preferential kernels are only supported.
 
        const dim_t mu = n / MR; // the n becomes m after a transposition
        const dim_t nu = m / NR; // the m becomes n after a transposition
diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h
index 7c315192d..838707637 100644
--- a/frame/3/bli_l3_sup_vars.h
+++ b/frame/3/bli_l3_sup_vars.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -198,9 +198,28 @@ BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases
    }
    else
    {
-       //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-       printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" );
-       bli_abort();
+       if ( dt == BLIS_DOUBLE )
+       {
+           // The optimizations are only done for the CRC and RRC storage
+           // schemes, to avoid RD kernels. Optimizations for the other
+           // storage schemes are yet to be done.
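+           // Packing A produces column-stored micropanels, so an eff_id
+           // whose A operand is row-stored can be treated as though A were
+           // column-stored: CRC then behaves as CCC. RRC is instead recast
+           // as the transposed problem, which maps it to RCC. Both rewrites
+           // let the rv (broadcast/FMA) kernels run where rd (dot-based)
+           // kernels would otherwise be required.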
+           if ( packa )
+           {
+               if( *eff_id == BLIS_CRC )
+               {
+                   *eff_id = BLIS_CCC;
+               }
+               else if ( *eff_id == BLIS_RRC )
+               {
+                   *trans = bli_trans_toggled( *trans );
+                   *eff_id = BLIS_RCC;
+               }
+           }
+       }
+       else
+       {
+           printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels for S, C and Z datatypes.\n" );
+           bli_abort();
+       }
    }
 }
diff --git a/kernels/zen4/3/CMakeLists.txt b/kernels/zen4/3/CMakeLists.txt
index 41a408063..5a3a96c49 100644
--- a/kernels/zen4/3/CMakeLists.txt
+++ b/kernels/zen4/3/CMakeLists.txt
@@ -11,4 +11,4 @@ target_sources("${PROJECT_NAME}"
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small_AVX512.c
 )
 
-add_subdirectory(sup)
\ No newline at end of file
+add_subdirectory(sup)
diff --git a/kernels/zen4/3/sup/CMakeLists.txt b/kernels/zen4/3/sup/CMakeLists.txt
index dd227c1fa..a70406110 100644
--- a/kernels/zen4/3/sup/CMakeLists.txt
+++ b/kernels/zen4/3/sup/CMakeLists.txt
@@ -10,4 +10,7 @@ target_sources("${PROJECT_NAME}"
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64.h
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64m.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64n.c
-    )
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_24x8m.c
+)
+
+add_subdirectory(d24x8)
diff --git a/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c
new file mode 100644
index 000000000..1109d7172
--- /dev/null
+++ b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c
@@ -0,0 +1,8845 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/* These kernels assume that the A matrix is in col-major order.
+ * The B matrix can be col/row-major.
+ * The C matrix can be col/row-major, though support for row-major order
+ * will be added by a separate commit.
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for the B and C matrices when row-stored is yet to be added.
+ * Prefetch of the A matrix is not done in the edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x8m
+(
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    double *abuf = a;
+    double *bbuf = b;
+    double *cbuf = c;
+
+    // n0 is actually n_left, which is calculated in the JR loop.
+    uint64_t n_left = (uint64_t)n0 % 8;
+
+    // First check whether this is an edge case in the n dimension. If so,
+    // dispatch other 24x? kernels, as needed.
+    if( n_left )
+    {
+        dgemmsup_ker_ft ker_fps[8] =
+        {
+          NULL,
+          bli_dgemmsup_rv_zen4_asm_24x1m,
+          bli_dgemmsup_rv_zen4_asm_24x2m,
+          bli_dgemmsup_rv_zen4_asm_24x3m,
+          bli_dgemmsup_rv_zen4_asm_24x4m,
+          bli_dgemmsup_rv_zen4_asm_24x5m,
+          bli_dgemmsup_rv_zen4_asm_24x6m,
+          bli_dgemmsup_rv_zen4_asm_24x7m,
+        };
+
+        dgemmsup_ker_ft ker_fp = ker_fps[ n_left ];
+
+        ker_fp
+        (
+          conja, conjb, m0, n_left, k0,
+          alpha, abuf, rs_a0, cs_a0, bbuf, rs_b0, cs_b0,
+          beta, cbuf, rs_c0, cs_c0, data, cntx
+        );
+
+        return;
+    }
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t m_iter = (uint64_t)m0 / 24;
+    uint64_t m_left = (uint64_t)m0 % 24;
+
+    uint64_t rs_a = rs_a0;
+    uint64_t cs_a = cs_a0;
+    uint64_t rs_b = rs_b0;
+    uint64_t cs_b = cs_b0;
+    uint64_t rs_c = rs_c0;
+    uint64_t cs_c = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8 = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* For one iteration of this loop, a block of MRxNR is computed.
+     * This loop moves along the m-dimension of the c matrix with steps of MR*rs_c.
+ */ + for(dim_t m=0; m < m_iter; m++) + { + + a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR) + b = bbuf; //Same KCXNR is used across different MRXKC in MCXKC + c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR) + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b + // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b + //is also used to traverse B matrix + lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b + lea(mem(rcx, 7*8), rdx) // C for prefetching + mov(var(ps_a8), r14) // panel stride of A + lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + // if n > 4, a second pointer which point to r11 + 4*cs_b + //is also used to prefetch from B matrix + lea(mem(r11, r9, 4), r15) // r15 = r11 + 4* cs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm28, zmm28, zmm28) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm29, zmm29, zmm29) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) + vxorpd(zmm26, zmm26, zmm26) + vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) + vxorpd(zmm27,zmm27, zmm27) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm24, zmm24, zmm24) + vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) + vxorpd(zmm25, zmm25, zmm25) + vxorpd(zmm18, zmm18, zmm18) + vxorpd(zmm19, zmm19, zmm19) + vxorpd(zmm22, zmm22, zmm22) + vxorpd(zmm20, zmm20, zmm20) + vxorpd(zmm21,zmm21, zmm21) + vxorpd(zmm23, zmm23, zmm23) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 8+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( 
mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( 
zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( 
zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 8 + + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + 
prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(8), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( 
r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + 
vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + 
vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( 
zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + 
vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer 
of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding 
column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. 
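+
+        // The unrolled k loop runs in three phases: .LOOP1 covers the bulk
+        // iterations with only A/B prefetching; .LOOP2 runs 8 iterations that
+        // additionally prefetch the 24x8 output tile of C (prefetchw0, one
+        // column of three cachelines per iteration); .LOOP3 runs TAIL_NITER
+        // further iterations so those C prefetches land before the stores in
+        // the post-accumulation code. The .TAIL section below then handles
+        // the k0 % 8 leftover iterations one at a time, without B prefetch.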
+ + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm29,zmm29 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm26,zmm26 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm13,zmm13 ) + vmulpd( zmm30,zmm27,zmm27 ) + vmulpd( zmm30,zmm14,zmm14 ) + vmulpd( zmm30,zmm15,zmm15 ) + vmulpd( zmm30,zmm24,zmm24 ) + vmulpd( zmm30,zmm16,zmm16 ) + vmulpd( zmm30,zmm17,zmm17 ) + vmulpd( zmm30,zmm25,zmm25 ) + vmulpd( zmm30,zmm18,zmm18 ) + vmulpd( zmm30,zmm19,zmm19 ) + vmulpd( zmm30,zmm22,zmm22 ) + vmulpd( zmm30,zmm20,zmm20 ) + vmulpd( zmm30,zmm21,zmm21 ) + vmulpd( zmm30,zmm23,zmm23 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vfmadd231pd( mem(rcx),zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vfmadd231pd( 0x40(rcx),zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vfmadd231pd( 0x80(rcx),zmm31,zmm28) + vmovupd( zmm28,0x80(rcx)) + vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29) + 
vmovupd( zmm29,0x80(rcx,rdi,1)) + vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26) + vmovupd( zmm26,0x80(rcx,rdi,2)) + vfmadd231pd( mem(rcx,r13,1),zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1)) + vfmadd231pd( 0x40(rcx,r13,1),zmm31,zmm13) + vmovupd( zmm13,0x40(rcx,r13,1)) + vfmadd231pd( 0x80(rcx,r13,1),zmm31,zmm27) + vmovupd( zmm27,0x80(rcx,r13,1)) + vfmadd231pd( mem(rdx),zmm31,zmm14) + vmovupd( zmm14,(rdx)) + vfmadd231pd( 0x40(rdx),zmm31,zmm15) + vmovupd( zmm15,0x40(rdx)) + vfmadd231pd( 0x80(rdx),zmm31,zmm24) + vmovupd( zmm24,0x80(rdx)) + vfmadd231pd( mem(rdx,rdi,1),zmm31,zmm16) + vmovupd( zmm16,(rdx,rdi,1)) + vfmadd231pd( 0x40(rdx,rdi,1),zmm31,zmm17) + vmovupd( zmm17,0x40(rdx,rdi,1)) + vfmadd231pd( 0x80(rdx,rdi,1),zmm31,zmm25) + vmovupd( zmm25,0x80(rdx,rdi,1)) + vfmadd231pd( mem(rdx,rdi,2),zmm31,zmm18) + vmovupd( zmm18,(rdx,rdi,2)) + vfmadd231pd( 0x40(rdx,rdi,2),zmm31,zmm19) + vmovupd( zmm19,0x40(rdx,rdi,2)) + vfmadd231pd( 0x80(rdx,rdi,2),zmm31,zmm22) + vmovupd( zmm22,0x80(rdx,rdi,2)) + vfmadd231pd( mem(rdx,r13,1),zmm31,zmm20) + vmovupd( zmm20,(rdx,r13,1)) + vfmadd231pd( 0x40(rdx,r13,1),zmm31,zmm21) + vmovupd( zmm21,0x40(rdx,r13,1)) + vfmadd231pd( 0x80(rdx,r13,1),zmm31,zmm23) + vmovupd( zmm23,0x80(rdx,r13,1)) + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx)) + vmovupd( zmm28,0x80(rcx)) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( zmm29,0x80(rcx,rdi,1)) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( zmm26,0x80(rcx,rdi,2)) + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( zmm13,0x40(rcx,r13,1)) + vmovupd( zmm27,0x80(rcx,r13,1)) + vmovupd( zmm14,(rdx)) + vmovupd( zmm15,0x40(rdx)) + vmovupd( zmm24,0x80(rdx)) + vmovupd( zmm16,(rdx,rdi,1)) + vmovupd( zmm17,0x40(rdx,rdi,1)) + vmovupd( zmm25,0x80(rdx,rdi,1)) + vmovupd( zmm18,(rdx,rdi,2)) + vmovupd( zmm19,0x40(rdx,rdi,2)) + vmovupd( zmm22,0x80(rdx,rdi,2)) + vmovupd( zmm20,(rdx,r13,1)) + vmovupd( zmm21,0x40(rdx,r13,1)) + vmovupd( zmm23,0x80(rdx,r13,1)) + + jmp(.DDONE) // jump to end. + + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + } //mloop + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist. 
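+	// After m_iter full 24-row blocks, m_left (< 24) rows remain. They
+	// are finished with a single call to the smallest masked kernel that
+	// still covers m_left; cij and ai point at the first unprocessed
+	// row block.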
+	if (m_left)
+	{
+		const dim_t nr_cur = 8;
+		const dim_t i_edge = m0 - ( dim_t )m_left;
+		double *restrict cij = cbuf + i_edge * rs_c;
+		double *restrict ai  = abuf + m_iter * ps_a;
+		double *restrict bj  = bbuf;
+		// covers the range 16 < m_left <= 24 by using masked load/store instructions
+		if( 16 < m_left )
+		{
+			bli_dgemmsup_rv_zen4_asm_24x8(
+				conja, conjb, m_left, nr_cur, k0,
+				alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+				beta, cij, rs_c0, cs_c0, data, cntx);
+		}
+		// covers the range 8 < m_left <= 16 by using masked load/store instructions
+		else if( 8 < m_left )
+		{
+			bli_dgemmsup_rv_zen4_asm_16x8(
+				conja, conjb, m_left, nr_cur, k0,
+				alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+				beta, cij, rs_c0, cs_c0, data, cntx);
+		}
+		// covers the range 0 < m_left <= 8 by using masked load/store instructions
+		else if( 0 < m_left )
+		{
+			bli_dgemmsup_rv_zen4_asm_8x8(
+				conja, conjb, m_left, nr_cur, k0,
+				alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+				beta, cij, rs_c0, cs_c0, data, cntx);
+		}
+	}
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+void bli_dgemmsup_rv_zen4_asm_24x7m
+(
+	conj_t              conja,
+	conj_t              conjb,
+	dim_t               m0,
+	dim_t               n0,
+	dim_t               k0,
+	double*    restrict alpha,
+	double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+	double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+	double*    restrict beta,
+	double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+	auxinfo_t* restrict data,
+	cntx_t*    restrict cntx
+)
+{
+	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+	double *abuf = a;
+	double *bbuf = b;
+	double *cbuf = c;
+
+	// Typecast local copies of integers in case dim_t and inc_t are a
+	// different size than is expected by load instructions.
+	uint64_t m_iter = (uint64_t)m0 / 24;
+	uint64_t m_left = (uint64_t)m0 % 24;
+
+	uint64_t rs_a = rs_a0;
+	uint64_t cs_a = cs_a0;
+	uint64_t rs_b = rs_b0;
+	uint64_t cs_b = cs_b0;
+	uint64_t rs_c = rs_c0;
+	uint64_t cs_c = cs_c0;
+
+	uint64_t ps_a  = bli_auxinfo_ps_a( data );
+	uint64_t ps_a8 = ps_a * sizeof( double );
+
+	uint64_t k_iter = (uint64_t)k0 / 8;
+	uint64_t k_left = (uint64_t)k0 % 8;
+
+	if ( m_iter == 0 ) goto consider_edge_cases;
+
+	/* Each iteration of this loop computes one MR x NR block of C.
+	 * The loop moves along the m-dimension of C in steps of MR*rs_c.
+ */ + for(dim_t m=0; m < m_iter; m++) + { + + a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR) + b = bbuf; //Same KCXNR is used across different MRXKC in MCXKC + c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR) + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b + // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b + //is also used to traverse B matrix + lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b + lea(mem(rcx, 7*8), rdx) // C for prefetching + mov(var(ps_a8), r14) // panel stride of A + lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + // if n > 4, a second pointer which point to r11 + 4*cs_b + //is also used to prefetch from B matrix + lea(mem(r11, r9, 4), r15) // r15 = r11 + 4* cs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm28, zmm28, zmm28) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm29, zmm29, zmm29) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) + vxorpd(zmm26, zmm26, zmm26) + vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) + vxorpd(zmm27,zmm27, zmm27) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm24, zmm24, zmm24) + vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) + vxorpd(zmm25, zmm25, zmm25) + vxorpd(zmm18, zmm18, zmm18) + vxorpd(zmm19, zmm19, zmm19) + vxorpd(zmm22, zmm22, zmm22) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 7+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + 
vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + 
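+	// The unrolled iterations alternate between the two A register sets
+	// (zmm0-2 and zmm3-5), so the loads for one iteration overlap the
+	// FMAs that consume the values loaded by the previous one.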
+ // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + 
vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 8 + + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( 
zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(7), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C 
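+	// The prefetchw0 instructions in iterations 1-3 together cover one
+	// 24-element column of C (three cache lines); rdx advances by cs_c
+	// at the end of each LOOP2 pass.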
+ vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( 
zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( 
zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += 
cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) 
+ vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + 
vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. 
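+	// TAIL: as in the 24x8m kernel above, the k_left (= k0 % 8) leftover
+	// k iterations are handled one at a time, without B prefetching.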
+ + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm29,zmm29 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm26,zmm26 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm13,zmm13 ) + vmulpd( zmm30,zmm27,zmm27 ) + vmulpd( zmm30,zmm14,zmm14 ) + vmulpd( zmm30,zmm15,zmm15 ) + vmulpd( zmm30,zmm24,zmm24 ) + vmulpd( zmm30,zmm16,zmm16 ) + vmulpd( zmm30,zmm17,zmm17 ) + vmulpd( zmm30,zmm25,zmm25 ) + vmulpd( zmm30,zmm18,zmm18 ) + vmulpd( zmm30,zmm19,zmm19 ) + vmulpd( zmm30,zmm22,zmm22 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vfmadd231pd( mem(rcx),zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vfmadd231pd( 0x40(rcx),zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vfmadd231pd( 0x80(rcx),zmm31,zmm28) + vmovupd( zmm28,0x80(rcx)) + vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29) + vmovupd( zmm29,0x80(rcx,rdi,1)) + vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26) + 
vmovupd( zmm26,0x80(rcx,rdi,2)) + vfmadd231pd( mem(rcx,r13,1),zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1)) + vfmadd231pd( 0x40(rcx,r13,1),zmm31,zmm13) + vmovupd( zmm13,0x40(rcx,r13,1)) + vfmadd231pd( 0x80(rcx,r13,1),zmm31,zmm27) + vmovupd( zmm27,0x80(rcx,r13,1)) + vfmadd231pd( mem(rdx),zmm31,zmm14) + vmovupd( zmm14,(rdx)) + vfmadd231pd( 0x40(rdx),zmm31,zmm15) + vmovupd( zmm15,0x40(rdx)) + vfmadd231pd( 0x80(rdx),zmm31,zmm24) + vmovupd( zmm24,0x80(rdx)) + vfmadd231pd( mem(rdx,rdi,1),zmm31,zmm16) + vmovupd( zmm16,(rdx,rdi,1)) + vfmadd231pd( 0x40(rdx,rdi,1),zmm31,zmm17) + vmovupd( zmm17,0x40(rdx,rdi,1)) + vfmadd231pd( 0x80(rdx,rdi,1),zmm31,zmm25) + vmovupd( zmm25,0x80(rdx,rdi,1)) + vfmadd231pd( mem(rdx,rdi,2),zmm31,zmm18) + vmovupd( zmm18,(rdx,rdi,2)) + vfmadd231pd( 0x40(rdx,rdi,2),zmm31,zmm19) + vmovupd( zmm19,0x40(rdx,rdi,2)) + vfmadd231pd( 0x80(rdx,rdi,2),zmm31,zmm22) + vmovupd( zmm22,0x80(rdx,rdi,2)) + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx)) + vmovupd( zmm28,0x80(rcx)) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( zmm29,0x80(rcx,rdi,1)) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( zmm26,0x80(rcx,rdi,2)) + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( zmm13,0x40(rcx,r13,1)) + vmovupd( zmm27,0x80(rcx,r13,1)) + vmovupd( zmm14,(rdx)) + vmovupd( zmm15,0x40(rdx)) + vmovupd( zmm24,0x80(rdx)) + vmovupd( zmm16,(rdx,rdi,1)) + vmovupd( zmm17,0x40(rdx,rdi,1)) + vmovupd( zmm25,0x80(rdx,rdi,1)) + vmovupd( zmm18,(rdx,rdi,2)) + vmovupd( zmm19,0x40(rdx,rdi,2)) + vmovupd( zmm22,0x80(rdx,rdi,2)) + + jmp(.DDONE) // jump to end. + + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + } //mloop + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist. 
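+	// Fringe handling mirrors the 24x8m kernel: the m_left leftover rows
+	// are dispatched to the masked 24x7, 16x7 or 8x7 kernel.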
+	if (m_left)
+	{
+		const dim_t nr_cur = 7;
+		const dim_t i_edge = m0 - ( dim_t )m_left;
+		double *restrict cij = cbuf + i_edge * rs_c;
+		double *restrict ai  = abuf + m_iter * ps_a;
+		double *restrict bj  = bbuf;
+		// covers the range 16 < m_left <= 24 by using masked load/store instructions
+		if( 16 < m_left )
+		{
+			bli_dgemmsup_rv_zen4_asm_24x7(
+				conja, conjb, m_left, nr_cur, k0,
+				alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+				beta, cij, rs_c0, cs_c0, data, cntx);
+		}
+		// covers the range 8 < m_left <= 16 by using masked load/store instructions
+		else if( 8 < m_left )
+		{
+			bli_dgemmsup_rv_zen4_asm_16x7(
+				conja, conjb, m_left, nr_cur, k0,
+				alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+				beta, cij, rs_c0, cs_c0, data, cntx);
+		}
+		// covers the range 0 < m_left <= 8 by using masked load/store instructions
+		else if( 0 < m_left )
+		{
+			bli_dgemmsup_rv_zen4_asm_8x7(
+				conja, conjb, m_left, nr_cur, k0,
+				alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+				beta, cij, rs_c0, cs_c0, data, cntx);
+		}
+	}
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+void bli_dgemmsup_rv_zen4_asm_24x6m
+(
+	conj_t              conja,
+	conj_t              conjb,
+	dim_t               m0,
+	dim_t               n0,
+	dim_t               k0,
+	double*    restrict alpha,
+	double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+	double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+	double*    restrict beta,
+	double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+	auxinfo_t* restrict data,
+	cntx_t*    restrict cntx
+)
+{
+	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+	double *abuf = a;
+	double *bbuf = b;
+	double *cbuf = c;
+
+	// Typecast local copies of integers in case dim_t and inc_t are a
+	// different size than is expected by load instructions.
+	uint64_t m_iter = (uint64_t)m0 / 24;
+	uint64_t m_left = (uint64_t)m0 % 24;
+
+	uint64_t rs_a = rs_a0;
+	uint64_t cs_a = cs_a0;
+	uint64_t rs_b = rs_b0;
+	uint64_t cs_b = cs_b0;
+	uint64_t rs_c = rs_c0;
+	uint64_t cs_c = cs_c0;
+
+	uint64_t ps_a  = bli_auxinfo_ps_a( data );
+	uint64_t ps_a8 = ps_a * sizeof( double );
+
+	uint64_t k_iter = (uint64_t)k0 / 8;
+	uint64_t k_left = (uint64_t)k0 % 8;
+
+	if ( m_iter == 0 ) goto consider_edge_cases;
+
+	/* Each iteration of this loop computes one MR x NR block of C.
+	 * The loop moves along the m-dimension of C in steps of MR*rs_c.
+ */ + for(dim_t m=0; m < m_iter; m++) + { + + a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR) + b = bbuf; //Same KCXNR is used across different MRXKC in MCXKC + c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR) + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b + // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b + //is also used to traverse B matrix + lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b + lea(mem(rcx, 7*8), rdx) // C for prefetching + mov(var(ps_a8), r14) // panel stride of A + lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + // if n > 4, a second pointer which point to r11 + 4*cs_b + //is also used to prefetch from B matrix + lea(mem(r11, r9, 4), r15) // r15 = r11 + 4* cs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm28, zmm28, zmm28) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm29, zmm29, zmm29) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) + vxorpd(zmm26, zmm26, zmm26) + vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) + vxorpd(zmm27,zmm27, zmm27) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm24, zmm24, zmm24) + vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) + vxorpd(zmm25, zmm25, zmm25) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 6+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + 
vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( 
zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + 
vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 8 + + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(6), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
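+
+        // NOTE (editorial sketch, not from the original source): the k_iter
+        // iterations are split across .LOOP1/.LOOP2/.LOOP3 so that the C
+        // prefetches issued in .LOOP2 land a fixed distance (TAIL_NITER
+        // iterations) ahead of the stores in .DPOSTACCUM.  In scalar form,
+        // with NR = 6 for this kernel (matching the imm(6) above):
+        //
+        //     int64_t i = k_iter - (6 + TAIL_NITER);
+        //     if (i > 0) do { /* .LOOP1: FMAs, A/B prefetch only */ } while (--i);
+        //     i += 6;              /* .PREFETCHLOOP */
+        //     if (i > 0) do { /* .LOOP2: FMAs + prefetchw0 of one column  */
+        //                     /* of C (3 cachelines = 24 doubles),        */
+        //                     /* rdx += cs_c per iteration                */ } while (--i);
+        //     i += TAIL_NITER;     /* .TAILITER */
+        //     if (i > 0) do { /* .LOOP3: FMAs only */ } while (--i);
+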
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( 
zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the 
corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + lea(mem(rdx, rdi, 
1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( 
mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 6 + 
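+        // NOTE (editorial): as throughout these loops, the A loads ping-pong
+        // between {zmm0,zmm1,zmm2} and {zmm3,zmm4,zmm5}; from iteration 2
+        // onward, each iteration's FMAs consume the column of A loaded by the
+        // previous iteration, hiding load latency behind the arithmetic
+        // (software pipelining of the A stream).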
vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + 
add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
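+
+        // NOTE (editorial sketch, not from the original source): .DPOSTACCUM
+        // below applies the standard C := beta*C + alpha*(A*B) update to the
+        // accumulated 24xNR tile.  Per element it is equivalent to:
+        //
+        //     acc *= alpha;                         // vmulpd block
+        //     if (cs_c != 1) {                      // column-stored C
+        //         if (beta != 0.0) C(i,j) = beta*C(i,j) + acc;  // .DCOLSTORED
+        //         else             C(i,j) = acc;                // .DCOLSTORBZ
+        //     }                                     // row-stored paths are stubs
+        //
+        // Each zmm accumulator holds 8 consecutive rows of one column of C,
+        // so a full 24-row column is a triple of registers stored at byte
+        // offsets 0x0/0x40/0x80.
+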
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm29,zmm29 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm26,zmm26 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm13,zmm13 ) + vmulpd( zmm30,zmm27,zmm27 ) + vmulpd( zmm30,zmm14,zmm14 ) + vmulpd( zmm30,zmm15,zmm15 ) + vmulpd( zmm30,zmm24,zmm24 ) + vmulpd( zmm30,zmm16,zmm16 ) + vmulpd( zmm30,zmm17,zmm17 ) + vmulpd( zmm30,zmm25,zmm25 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vfmadd231pd( mem(rcx),zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vfmadd231pd( 0x40(rcx),zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vfmadd231pd( 0x80(rcx),zmm31,zmm28) + vmovupd( zmm28,0x80(rcx)) + vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29) + vmovupd( zmm29,0x80(rcx,rdi,1)) + vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26) + vmovupd( zmm26,0x80(rcx,rdi,2)) + vfmadd231pd( mem(rcx,r13,1),zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1)) + vfmadd231pd( 0x40(rcx,r13,1),zmm31,zmm13) + vmovupd( zmm13,0x40(rcx,r13,1)) + vfmadd231pd( 0x80(rcx,r13,1),zmm31,zmm27) + vmovupd( zmm27,0x80(rcx,r13,1)) + vfmadd231pd( mem(rdx),zmm31,zmm14) + vmovupd( zmm14,(rdx)) + vfmadd231pd( 0x40(rdx),zmm31,zmm15) + vmovupd( zmm15,0x40(rdx)) + vfmadd231pd( 0x80(rdx),zmm31,zmm24) + vmovupd( zmm24,0x80(rdx)) + vfmadd231pd( mem(rdx,rdi,1),zmm31,zmm16) + vmovupd( zmm16,(rdx,rdi,1)) + vfmadd231pd( 0x40(rdx,rdi,1),zmm31,zmm17) + vmovupd( zmm17,0x40(rdx,rdi,1)) + vfmadd231pd( 0x80(rdx,rdi,1),zmm31,zmm25) + vmovupd( zmm25,0x80(rdx,rdi,1)) + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx)) + vmovupd( zmm28,0x80(rcx)) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( zmm29,0x80(rcx,rdi,1)) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( zmm26,0x80(rcx,rdi,2)) + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( zmm13,0x40(rcx,r13,1)) + vmovupd( zmm27,0x80(rcx,r13,1)) + vmovupd( zmm14,(rdx)) + vmovupd( zmm15,0x40(rdx)) + vmovupd( zmm24,0x80(rdx)) + vmovupd( zmm16,(rdx,rdi,1)) + vmovupd( zmm17,0x40(rdx,rdi,1)) + vmovupd( zmm25,0x80(rdx,rdi,1)) + + jmp(.DDONE) // jump to end. 
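+
+        // NOTE (editorial): like .DROWSTORED above, .DROWSTORBZ below is a
+        // stub that performs no stores.  Row-stored C (cs_c == 1) is
+        // presumably filtered out before this kernel is selected, so control
+        // is not expected to reach either label with work left to do.
+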
+ + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + } //mloop + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist. + if (m_left) + { + const dim_t nr_cur = 6; + const dim_t i_edge = m0 - ( dim_t )m_left; + double *restrict cij = cbuf + i_edge * rs_c; + double *restrict ai = abuf + m_iter * ps_a; + double *restrict bj = bbuf; + // covers the range 16 < m_left <= 24 by using masked load/store instructions + if( 16 < m_left ) + { + bli_dgemmsup_rv_zen4_asm_24x6( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx); + } + // covers the range 8 < m_left <= 16 by using masked load/store instructions + else if( 8 < m_left ) + { + bli_dgemmsup_rv_zen4_asm_16x6( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx); + } + // covers the range 0 < m_left <= 8 by using masked load/store instructions + else if( 0 < m_left ) + { + bli_dgemmsup_rv_zen4_asm_8x6( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx); + } + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); +} + +void bli_dgemmsup_rv_zen4_asm_24x5m +( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + double *abuf = a; + double *bbuf = b; + double *cbuf = c; + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t m_iter = (uint64_t)m0 / 24; + uint64_t m_left = (uint64_t)m0 % 24; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + uint64_t k_iter = (uint64_t)k0 / 8; + uint64_t k_left = (uint64_t)k0 % 8; + + if ( m_iter == 0 ) goto consider_edge_cases; + + /* For one iteration of this loop, a block of MRxNR is computed + * This loop moves along m-dimension of c matrix with steps of MR*rs_c. 
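+ * For this kernel MR = 24 and NR = 5: each iteration computes one full
+ * 24x5 block of C (m_iter = m0/24 such blocks), and any remaining rows
+ * (m_left = m0%24) are handled separately under consider_edge_cases,
+ * as in the kernels above, via masked load/store variants.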
+ */ + for(dim_t m=0; m < m_iter; m++) + { + + a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR) + b = bbuf; //Same KCXNR is used across different MRXKC in MCXKC + c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR) + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b + // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b + //is also used to traverse B matrix + lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b + lea(mem(rcx, 7*8), rdx) // C for prefetching + mov(var(ps_a8), r14) // panel stride of A + lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + // if n > 4, a second pointer which point to r11 + 4*cs_b + //is also used to prefetch from B matrix + lea(mem(r11, r9, 4), r15) // r15 = r11 + 4* cs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm28, zmm28, zmm28) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm29, zmm29, zmm29) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) + vxorpd(zmm26, zmm26, zmm26) + vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) + vxorpd(zmm27,zmm27, zmm27) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm24, zmm24, zmm24) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 5+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( 
zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( 
zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( 
zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 8 + + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(5), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( 
zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( 
mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + 
vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 
) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( 
mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. 
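Editor's note on the unrolled loops above: every k-iteration performs one rank-1 update of a 24x5 tile of C held entirely in registers. The 24 rows of an A column arrive as three 8-wide zmm loads (zmm0-2 alternating with zmm3-5), each of the five B entries is broadcast with vbroadcastsd, and two B pointers (rbx for the first columns, r12 for the last) split the addressing. A scalar C sketch of one such k-step; the names tile_24x5_kstep, acc, a_col and b_row are illustrative, not from the patch:

```c
#include <stddef.h>

/* Scalar model of one k-step of the 24x5 micro-tile update done by the
 * unrolled loops above. In the kernel, acc stays in zmm6-zmm15 plus
 * zmm24/zmm26-zmm29 and is never spilled; each multiply-add below
 * corresponds to one lane of a vfmadd231pd. */
static void tile_24x5_kstep(double acc[24][5],
                            const double a_col[24],
                            const double b_row[5])
{
    for (size_t j = 0; j < 5; ++j)        /* one vbroadcastsd per column */
        for (size_t i = 0; i < 24; ++i)   /* three zmm vectors of 8 rows */
            acc[i][j] += a_col[i] * b_row[j];
}
```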
+
+
+        label(.TAIL)
+        mov(var(k_left), rsi)                      // i = k_left
+        test(rsi, rsi)                             // check i via logical AND
+        je(.DPOSTACCUM)                            // if i == 0, jump to post-accumulation
+
+        label(.DLOOPKLEFT)                         // k_left loop
+        vmovupd( mem(rax),zmm0 )                   // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 )
+        add( r10,rax )                             // a += cs_a
+        //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+        prefetch( 1,mem(r14) )
+        prefetch( 1,0x40(r14) )
+        prefetch( 1,0x80(r14) )
+        add( r10,r14 )                             // a_next += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                              // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                              // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        dec(rsi)                                   // i -= 1
+        jne(.DLOOPKLEFT)                           // iterate again if i != 0.
+
+
+        label(.DPOSTACCUM)
+        mov(var(alpha), rdx)                       // load address of alpha
+        vbroadcastsd(mem(rdx), zmm30)              // broadcast alpha
+        mov(var(beta), rax)                        // load address of beta
+        vbroadcastsd(mem(rax), zmm31)              // broadcast beta
+
+        // scale by alpha
+        vmulpd( zmm30,zmm6,zmm6 )
+        vmulpd( zmm30,zmm7,zmm7 )
+        vmulpd( zmm30,zmm28,zmm28 )
+        vmulpd( zmm30,zmm8,zmm8 )
+        vmulpd( zmm30,zmm9,zmm9 )
+        vmulpd( zmm30,zmm29,zmm29 )
+        vmulpd( zmm30,zmm10,zmm10 )
+        vmulpd( zmm30,zmm11,zmm11 )
+        vmulpd( zmm30,zmm26,zmm26 )
+        vmulpd( zmm30,zmm12,zmm12 )
+        vmulpd( zmm30,zmm13,zmm13 )
+        vmulpd( zmm30,zmm27,zmm27 )
+        vmulpd( zmm30,zmm14,zmm14 )
+        vmulpd( zmm30,zmm15,zmm15 )
+        vmulpd( zmm30,zmm24,zmm24 )
+
+
+        mov(var(rs_c), rsi)                        // load rs_c
+        lea(mem(, rsi, 8), rsi)                    // rsi = rs_c * sizeof(double)
+        lea(mem(rcx, rdi, 4), rdx)                 // rdx = rcx + 4 * cs_c
+        lea(mem(rdi, rdi, 2), r13)                 // r13 = 3*cs_c
+        vxorpd(ymm2, ymm2, ymm2)
+        vucomisd(xmm2, xmm31)                      // set ZF if beta == 0
+        je(.DBETAZERO)                             // if ZF == 1, jump to beta == 0 case
+
+
+        cmp(imm(8), rdi)                           // set ZF if (8*cs_c) == 8
+
+
+        jz(.DROWSTORED)                            // jump to row storage case
+
+        label(.DCOLSTORED)
+        vfmadd231pd( mem(rcx),zmm31,zmm6)
+        vmovupd( zmm6,(rcx))
+        vfmadd231pd( 0x40(rcx),zmm31,zmm7)
+        vmovupd( zmm7,0x40(rcx))
+        vfmadd231pd( 0x80(rcx),zmm31,zmm28)
+        vmovupd( zmm28,0x80(rcx))
+        vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8)
+        vmovupd( zmm8,(rcx,rdi,1))
+        vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9)
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29)
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10)
+        vmovupd( zmm10,(rcx,rdi,2))
+        vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11)
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26)
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vfmadd231pd( mem(rcx,r13,1),zmm31,zmm12)
+        vmovupd( zmm12,(rcx,r13,1))
+        vfmadd231pd( 0x40(rcx,r13,1),zmm31,zmm13)
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vfmadd231pd( 0x80(rcx,r13,1),zmm31,zmm27)
+        vmovupd( zmm27,0x80(rcx,r13,1))
+        vfmadd231pd( mem(rdx),zmm31,zmm14)
+        vmovupd( zmm14,(rdx))
+        vfmadd231pd( 0x40(rdx),zmm31,zmm15)
+        vmovupd( zmm15,0x40(rdx))
+        vfmadd231pd( 0x80(rdx),zmm31,zmm24)
+        vmovupd( zmm24,0x80(rdx))
+
+        jmp(.DDONE)                                // jump to end.
+
+        label(.DROWSTORED)
+
+        // yet to be implemented
+        jmp(.DDONE)                                // jump to end.
+
+
+        label(.DBETAZERO)
+        cmp(imm(8), rdi)                           // set ZF if (8*cs_c) == 8
+
+        jz(.DROWSTORBZ)                            // jump to row storage case
+        label(.DCOLSTORBZ)
+        vmovupd( zmm6,(rcx))
+        vmovupd( zmm7,0x40(rcx))
+        vmovupd( zmm28,0x80(rcx))
+        vmovupd( zmm8,(rcx,rdi,1))
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vmovupd( zmm10,(rcx,rdi,2))
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vmovupd( zmm12,(rcx,r13,1))
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vmovupd( zmm27,0x80(rcx,r13,1))
+        vmovupd( zmm14,(rdx))
+        vmovupd( zmm15,0x40(rdx))
+        vmovupd( zmm24,0x80(rdx))
+
+        jmp(.DDONE)                                // jump to end.
+
+
+        label(.DROWSTORBZ)
+
+        // yet to be implemented
+        label(.DDONE)
+
+
+        vzeroupper()
+
+        end_asm(
+        : // output operands (none)
+        : // input operands
+          [k_iter] "m" (k_iter),
+          [k_left] "m" (k_left),
+          [a] "m" (a),
+          [rs_a] "m" (rs_a),
+          [cs_a] "m" (cs_a),
+          [ps_a8] "m" (ps_a8),
+          [b] "m" (b),
+          [rs_b] "m" (rs_b),
+          [cs_b] "m" (cs_b),
+          [alpha] "m" (alpha),
+          [beta] "m" (beta),
+          [c] "m" (c),
+          [rs_c] "m" (rs_c),
+          [cs_c] "m" (cs_c),
+          [n0] "m" (n0),
+          [m0] "m" (m0)
+        : // register clobber list
+          "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+          "zmm0", "zmm1", "zmm2", "zmm3",
+          "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+          "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+          "zmm16", "zmm17", "zmm18", "zmm19",
+          "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+          "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+          "memory"
+        )
+    } //mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if (m_left)
+    {
+        const dim_t nr_cur = 5;
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+        double *restrict cij = cbuf + i_edge * rs_c;
+        double *restrict ai = abuf + m_iter * ps_a;
+        double *restrict bj = bbuf;
+        // covers the range 16 < m_left <= 24 by using masked load/store instructions
+        if( 16 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_24x5(
+                conja, conjb, m_left, nr_cur, k0,
+                alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+                beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 8 < m_left <= 16 by using masked load/store instructions
+        else if( 8 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_16x5(
+                conja, conjb, m_left, nr_cur, k0,
+                alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+                beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 0 < m_left <= 8 by using masked load/store instructions
+        else if( 0 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_8x5(
+                conja, conjb, m_left, nr_cur, k0,
+                alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+                beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+    }
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+void bli_dgemmsup_rv_zen4_asm_24x4m
+(
+    conj_t conja,
+    conj_t conjb,
+    dim_t m0,
+    dim_t n0,
+    dim_t k0,
+    double* restrict alpha,
+    double* restrict a, inc_t rs_a0, inc_t cs_a0,
+    double* restrict b, inc_t rs_b0, inc_t cs_b0,
+    double* restrict beta,
+    double* restrict c, inc_t rs_c0, inc_t cs_c0,
+    auxinfo_t* restrict data,
+    cntx_t* restrict cntx
+)
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    double *abuf = a;
+    double *bbuf = b;
+    double *cbuf = c;
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+ uint64_t m_iter = (uint64_t)m0 / 24; + uint64_t m_left = (uint64_t)m0 % 24; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + uint64_t k_iter = (uint64_t)k0 / 8; + uint64_t k_left = (uint64_t)k0 % 8; + + if ( m_iter == 0 ) goto consider_edge_cases; + + /* For one iteration of this loop, a block of MRxNR is computed + * This loop moves along m-dimension of c matrix with steps of MR*rs_c. + */ + for(dim_t m=0; m < m_iter; m++) + { + + a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR) + b = bbuf; //Same KCXNR is used across different MRXKC in MCXKC + c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR) + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b + lea(mem(rcx, 7*8), rdx) // C for prefetching + mov(var(ps_a8), r14) // panel stride of A + lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm28, zmm28, zmm28) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm29, zmm29, zmm29) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) + vxorpd(zmm26, zmm26, zmm26) + vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) + vxorpd(zmm27,zmm27, zmm27) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 4+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( 
zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( 
mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 8 + + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(4), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
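The counter arithmetic just above (sub(imm(4+TAIL_NITER), rsi) at loop entry, then the add fix-ups at .PREFETCHLOOP and .TAILITER) splits the 8x-unrolled k loop into three phases: .LOOP1 runs the bulk with A/B prefetch only, .LOOP2 runs four passes (one per column of C for this n=4 kernel) that additionally issue prefetchw0 on C, and .LOOP3 runs TAIL_NITER passes so those C prefetches have landed before the stores. A control-flow sketch under that reading; run_loop1/2/3 are illustrative stand-ins for one unrolled pass each:

```c
extern void run_loop1(void), run_loop2(void), run_loop3(void);

/* Phase budget implied by the rsi arithmetic around .LOOP1/.LOOP2/.LOOP3.
 * If k_iter is small, the leading phases are simply skipped, matching
 * the jle() early exits in the assembly. */
void k_loop_phases(long k_iter, long tail_niter)
{
    long i = k_iter - (4 + tail_niter);
    for (; i > 0; --i) run_loop1();                /* bulk: no C traffic     */
    for (i += 4; i > 0; --i) run_loop2();          /* prefetch one C col/pass */
    for (i += tail_niter; i > 0; --i) run_loop3(); /* let C prefetches land  */
}
```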
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + 
prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + 
vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + 
prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) 
+ vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
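What follows is the standard sup epilogue for this kernel family: the accumulators are scaled by alpha, beta is compared against zero (vucomisd) to select a load-free store path, and cmp(imm(8), rdi) tests whether cs_c is one element (8 bytes) to choose column- versus row-stored code; the row-stored branches are still stubs here ("yet to be implemented"). A scalar sketch of the column-stored C update, with illustrative names (post_accum, acc):

```c
/* Scalar model of .DPOSTACCUM/.DCOLSTORED and the .DBETAZERO variant,
 * computing C := beta*C + alpha*acc for an mr x nr tile (mr = 24,
 * nr = 4 at this point in the patch). */
static void post_accum(int mr, int nr, double alpha, double beta,
                       const double *acc,  /* mr*nr accumulators, col-major */
                       double *c, long rs_c, long cs_c)
{
    for (int j = 0; j < nr; ++j)
        for (int i = 0; i < mr; ++i)
        {
            double t = alpha * acc[i + (long)j * mr];  /* "scale by alpha" */
            double *cij = c + i * rs_c + j * cs_c;
            /* beta == 0 takes .DCOLSTORBZ: store without reading C */
            *cij = (beta == 0.0) ? t : t + beta * (*cij);
        }
}
```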
+
+
+        label(.DPOSTACCUM)
+        mov(var(alpha), rdx)                       // load address of alpha
+        vbroadcastsd(mem(rdx), zmm30)              // broadcast alpha
+        mov(var(beta), rax)                        // load address of beta
+        vbroadcastsd(mem(rax), zmm31)              // broadcast beta
+
+        // scale by alpha
+        vmulpd( zmm30,zmm6,zmm6 )
+        vmulpd( zmm30,zmm7,zmm7 )
+        vmulpd( zmm30,zmm28,zmm28 )
+        vmulpd( zmm30,zmm8,zmm8 )
+        vmulpd( zmm30,zmm9,zmm9 )
+        vmulpd( zmm30,zmm29,zmm29 )
+        vmulpd( zmm30,zmm10,zmm10 )
+        vmulpd( zmm30,zmm11,zmm11 )
+        vmulpd( zmm30,zmm26,zmm26 )
+        vmulpd( zmm30,zmm12,zmm12 )
+        vmulpd( zmm30,zmm13,zmm13 )
+        vmulpd( zmm30,zmm27,zmm27 )
+
+
+        mov(var(rs_c), rsi)                        // load rs_c
+        lea(mem(, rsi, 8), rsi)                    // rsi = rs_c * sizeof(double)
+        lea(mem(rdi, rdi, 2), r13)                 // r13 = 3*cs_c
+        vxorpd(ymm2, ymm2, ymm2)
+        vucomisd(xmm2, xmm31)                      // set ZF if beta == 0
+        je(.DBETAZERO)                             // if ZF == 1, jump to beta == 0 case
+
+
+        cmp(imm(8), rdi)                           // set ZF if (8*cs_c) == 8
+
+
+        jz(.DROWSTORED)                            // jump to row storage case
+
+        label(.DCOLSTORED)
+        vfmadd231pd( mem(rcx),zmm31,zmm6)
+        vmovupd( zmm6,(rcx))
+        vfmadd231pd( 0x40(rcx),zmm31,zmm7)
+        vmovupd( zmm7,0x40(rcx))
+        vfmadd231pd( 0x80(rcx),zmm31,zmm28)
+        vmovupd( zmm28,0x80(rcx))
+        vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8)
+        vmovupd( zmm8,(rcx,rdi,1))
+        vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9)
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29)
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10)
+        vmovupd( zmm10,(rcx,rdi,2))
+        vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11)
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26)
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vfmadd231pd( mem(rcx,r13,1),zmm31,zmm12)
+        vmovupd( zmm12,(rcx,r13,1))
+        vfmadd231pd( 0x40(rcx,r13,1),zmm31,zmm13)
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vfmadd231pd( 0x80(rcx,r13,1),zmm31,zmm27)
+        vmovupd( zmm27,0x80(rcx,r13,1))
+
+        jmp(.DDONE)                                // jump to end.
+
+        label(.DROWSTORED)
+
+        // yet to be implemented
+        jmp(.DDONE)                                // jump to end.
+
+
+        label(.DBETAZERO)
+        cmp(imm(8), rdi)                           // set ZF if (8*cs_c) == 8
+
+        jz(.DROWSTORBZ)                            // jump to row storage case
+        label(.DCOLSTORBZ)
+        vmovupd( zmm6,(rcx))
+        vmovupd( zmm7,0x40(rcx))
+        vmovupd( zmm28,0x80(rcx))
+        vmovupd( zmm8,(rcx,rdi,1))
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vmovupd( zmm10,(rcx,rdi,2))
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vmovupd( zmm12,(rcx,r13,1))
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vmovupd( zmm27,0x80(rcx,r13,1))
+
+        jmp(.DDONE)                                // jump to end.
+
+
+        label(.DROWSTORBZ)
+
+        // yet to be implemented
+        label(.DDONE)
+
+
+        vzeroupper()
+
+        end_asm(
+        : // output operands (none)
+        : // input operands
+          [k_iter] "m" (k_iter),
+          [k_left] "m" (k_left),
+          [a] "m" (a),
+          [rs_a] "m" (rs_a),
+          [cs_a] "m" (cs_a),
+          [ps_a8] "m" (ps_a8),
+          [b] "m" (b),
+          [rs_b] "m" (rs_b),
+          [cs_b] "m" (cs_b),
+          [alpha] "m" (alpha),
+          [beta] "m" (beta),
+          [c] "m" (c),
+          [rs_c] "m" (rs_c),
+          [cs_c] "m" (cs_c),
+          [n0] "m" (n0),
+          [m0] "m" (m0)
+        : // register clobber list
+          "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+          "zmm0", "zmm1", "zmm2", "zmm3",
+          "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+          "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+          "zmm16", "zmm17", "zmm18", "zmm19",
+          "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+          "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+          "memory"
+        )
+    } //mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if (m_left)
+    {
+        const dim_t nr_cur = 4;
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+        double *restrict cij = cbuf + i_edge * rs_c;
+        double *restrict ai = abuf + m_iter * ps_a;
+        double *restrict bj = bbuf;
+        // covers the range 16 < m_left <= 24 by using masked load/store instructions
+        if( 16 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_24x4(
+                conja, conjb, m_left, nr_cur, k0,
+                alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+                beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 8 < m_left <= 16 by using masked load/store instructions
+        else if( 8 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_16x4(
+                conja, conjb, m_left, nr_cur, k0,
+                alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+                beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 0 < m_left <= 8 by using masked load/store instructions
+        else if( 0 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_8x4(
+                conja, conjb, m_left, nr_cur, k0,
+                alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+                beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+    }
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+void bli_dgemmsup_rv_zen4_asm_24x3m
+(
+    conj_t conja,
+    conj_t conjb,
+    dim_t m0,
+    dim_t n0,
+    dim_t k0,
+    double* restrict alpha,
+    double* restrict a, inc_t rs_a0, inc_t cs_a0,
+    double* restrict b, inc_t rs_b0, inc_t cs_b0,
+    double* restrict beta,
+    double* restrict c, inc_t rs_c0, inc_t cs_c0,
+    auxinfo_t* restrict data,
+    cntx_t* restrict cntx
+)
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    double *abuf = a;
+    double *bbuf = b;
+    double *cbuf = c;
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t m_iter = (uint64_t)m0 / 24;
+    uint64_t m_left = (uint64_t)m0 % 24;
+
+    uint64_t rs_a = rs_a0;
+    uint64_t cs_a = cs_a0;
+    uint64_t rs_b = rs_b0;
+    uint64_t cs_b = cs_b0;
+    uint64_t rs_c = rs_c0;
+    uint64_t cs_c = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8 = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* For one iteration of this loop, a block of MRxNR is computed
+     * This loop moves along m-dimension of c matrix with steps of MR*rs_c.
+ */ + for(dim_t m=0; m < m_iter; m++) + { + + a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR) + b = bbuf; //Same KCXNR is used across different MRXKC in MCXKC + c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR) + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, 7*8), rdx) // C for prefetching + mov(var(ps_a8), r14) // panel stride of A + lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm28, zmm28, zmm28) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm29, zmm29, zmm29) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) + vxorpd(zmm26, zmm26, zmm26) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 3+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( 
zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A 
+ vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 8 + + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(3), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + 
+ // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 
cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 
0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + 
vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
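+
+ // Reference sketch of the k-loop phasing above (comment-only C, assuming
+ // TAIL_NITER is the fixed tail length defined for these kernels): the k
+ // dimension is split as k0 = 8*k_iter + k_left, and the 8x-unrolled body
+ // runs in up to three phases so the write-prefetch of C lands a fixed
+ // number of iterations before the stores:
+ //
+ //   for ( i = 0; i < k_iter - (3 + TAIL_NITER); i++ ) { ... }  // .LOOP1
+ //   for ( i = 0; i < 3; i++ )          { ... /* + prefetchw C */ } // .LOOP2
+ //   for ( i = 0; i < TAIL_NITER; i++ ) { ... }                 // .LOOP3
+ //   for ( i = 0; i < k_left; i++ )     { ... }                 // .DLOOPKLEFT
+ //
+ // where 3 is this kernel's NR; the jle() guards clamp each phase's count
+ // when k_iter is too small for the full schedule.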
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm29,zmm29 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm26,zmm26 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vfmadd231pd( mem(rcx),zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vfmadd231pd( 0x40(rcx),zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vfmadd231pd( 0x80(rcx),zmm31,zmm28) + vmovupd( zmm28,0x80(rcx)) + vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29) + vmovupd( zmm29,0x80(rcx,rdi,1)) + vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26) + vmovupd( zmm26,0x80(rcx,rdi,2)) + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx)) + vmovupd( zmm28,0x80(rcx)) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( zmm29,0x80(rcx,rdi,1)) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( zmm26,0x80(rcx,rdi,2)) + + jmp(.DDONE) // jump to end. + + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + } //mloop + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist. 
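+ // Fringe note: i_edge = m0 - m_left is the first row not covered by the
+ // 24-row main loop, so cij points at the remaining block of C, ai at the
+ // A panel following the m_iter full 24-row panels, and bj back at the
+ // start of B; each masked kernel called below absorbs its whole stated
+ // m_left range in one call, per the range comments that follow.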
+ if (m_left) + { + const dim_t nr_cur = 3; + const dim_t i_edge = m0 - ( dim_t )m_left; + double *restrict cij = cbuf + i_edge * rs_c; + double *restrict ai = abuf + m_iter * ps_a; + double *restrict bj = bbuf; + // covers the range 16 < m_left <= 24 by using masked load/store instructions + if( 16 < m_left ) + { + bli_dgemmsup_rv_zen4_asm_24x3( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx); + } + // covers the range 8 < m_left <= 16 by using masked load/store instructions + else if( 8 < m_left ) + { + bli_dgemmsup_rv_zen4_asm_16x3( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx); + } + // covers the range 0 < m_left <= 8 by using masked load/store instructions + else if( 0 < m_left ) + { + bli_dgemmsup_rv_zen4_asm_8x3( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx); + } + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); +} + +void bli_dgemmsup_rv_zen4_asm_24x2m +( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + double *abuf = a; + double *bbuf = b; + double *cbuf = c; + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t m_iter = (uint64_t)m0 / 24; + uint64_t m_left = (uint64_t)m0 % 24; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + uint64_t k_iter = (uint64_t)k0 / 8; + uint64_t k_left = (uint64_t)k0 % 8; + + if ( m_iter == 0 ) goto consider_edge_cases; + + /* For one iteration of this loop, a block of MRxNR is computed + * This loop moves along m-dimension of c matrix with steps of MR*rs_c. 
+ */ + for(dim_t m=0; m < m_iter; m++) + { + + a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR) + b = bbuf; //Same KCXNR is used across different MRXKC in MCXKC + c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR) + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, 7*8), rdx) // C for prefetching + mov(var(ps_a8), r14) // panel stride of A + lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm28, zmm28, zmm28) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm29, zmm29, zmm29) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 2+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of 
the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 8 + + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) 
+ vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(2), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 
) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. 
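+
+ // Note on .LOOP2 above: rdx was seeded with &C(7,0) (rcx + 7*8) for
+ // prefetching, and its first three unrolled iterations issue prefetchw0 at
+ // rdx, rdx+64 and rdx+128, the three cachelines spanning one 24-double
+ // column of the output tile; rdx then advances by cs_c per pass, so the
+ // NR (= 2) passes of .LOOP2 prefetch every column of C for writing
+ // shortly before .DPOSTACCUM.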
+ label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( 
zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
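+
+ // Comment-only sketch of the epilogue that follows (.DPOSTACCUM onward),
+ // applied per column for the two columns accumulated here:
+ //
+ //   acc *= alpha;                              // vmulpd by zmm30
+ //   if ( beta != 0.0 && cs_c != 1 )            // vucomisd + cmp(8, rdi)
+ //       C(:,j) = beta*C(:,j) + acc;            // .DCOLSTORED
+ //   else if ( cs_c != 1 )
+ //       C(:,j) = acc;                          // .DCOLSTORBZ
+ //   // cs_c == 1 (row-stored C): .DROWSTORED/.DROWSTORBZ are stubs,
+ //   // marked "yet to be implemented" below.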
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm29,zmm29 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vfmadd231pd( mem(rcx),zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vfmadd231pd( 0x40(rcx),zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vfmadd231pd( 0x80(rcx),zmm31,zmm28) + vmovupd( zmm28,0x80(rcx)) + vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29) + vmovupd( zmm29,0x80(rcx,rdi,1)) + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx)) + vmovupd( zmm28,0x80(rcx)) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( zmm29,0x80(rcx,rdi,1)) + + jmp(.DDONE) // jump to end. + + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + } //mloop + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist. 
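+ // As in the 24x4m/24x3m fringe paths above, m_left is handed to a single
+ // masked kernel chosen by 8-row band; only nr_cur and the x2 kernel names
+ // differ here.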
+ if (m_left) + { + const dim_t nr_cur = 2; + const dim_t i_edge = m0 - ( dim_t )m_left; + double *restrict cij = cbuf + i_edge * rs_c; + double *restrict ai = abuf + m_iter * ps_a; + double *restrict bj = bbuf; + // covers the range 16 < m_left <= 24 by using masked load/store instructions + if( 16 < m_left ) + { + bli_dgemmsup_rv_zen4_asm_24x2( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx); + } + // covers the range 8 < m_left <= 16 by using masked load/store instructions + else if( 8 < m_left ) + { + bli_dgemmsup_rv_zen4_asm_16x2( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx); + } + // covers the range 0 < m_left <= 8 by using masked load/store instructions + else if( 0 < m_left ) + { + bli_dgemmsup_rv_zen4_asm_8x2( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx); + } + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); +} + +void bli_dgemmsup_rv_zen4_asm_24x1m +( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + double *abuf = a; + double *bbuf = b; + double *cbuf = c; + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t m_iter = (uint64_t)m0 / 24; + uint64_t m_left = (uint64_t)m0 % 24; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + uint64_t k_iter = (uint64_t)k0 / 8; + uint64_t k_left = (uint64_t)k0 % 8; + + if ( m_iter == 0 ) goto consider_edge_cases; + + /* For one iteration of this loop, a block of MRxNR is computed + * This loop moves along m-dimension of c matrix with steps of MR*rs_c. 
+ */ + for(dim_t m=0; m < m_iter; m++) + { + + a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR) + b = bbuf; //Same KCXNR is used across different MRXKC in MCXKC + c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR) + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, 7*8), rdx) // C for prefetching + mov(var(ps_a8), r14) // panel stride of A + lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm28, zmm28, zmm28) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 1+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // 
load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 8 + + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(1), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
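+
+ // With NR = 1 there is a single column of C, held in just zmm6/zmm7/zmm28,
+ // so .LOOP1 runs for k_iter - (1 + TAIL_NITER) iterations and the one pass
+ // of .LOOP2 below is enough to write-prefetch all three cachelines of that
+ // column before .LOOP3 drains the remaining TAIL_NITER iterations.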
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // 
---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // 
---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 8 + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 ) + add( r10,rax ) // a += cs_a + //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A + prefetch( 1,mem(r14) ) + prefetch( 1,0x40(r14) ) + prefetch( 1,0x80(r14) ) + add( r10,r14 ) // a_next += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
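+
+ // Accumulation over k is complete: zmm6, zmm7 and zmm28 hold rows
+ // 0-7, 8-15 and 16-23 of the 24x1 result panel, i.e. the sum over k
+ // of A[i,k] * b[k], still unscaled by alpha.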
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vfmadd231pd( mem(rcx),zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vfmadd231pd( 0x40(rcx),zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vfmadd231pd( 0x80(rcx),zmm31,zmm28) + vmovupd( zmm28,0x80(rcx)) + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx)) + vmovupd( zmm28,0x80(rcx)) + + jmp(.DDONE) // jump to end. + + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + } //mloop + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist. 
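+ // nr_cur = 1: this edge path computes one column of C at a time. The
+ // masked kernels below derive an 8-lane load/store mask from m_left
+ // (passed in as m0), so all remaining rows are handled in one call.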
+ if (m_left)
+ {
+ const dim_t nr_cur = 1;
+ const dim_t i_edge = m0 - ( dim_t )m_left;
+ double *restrict cij = cbuf + i_edge * rs_c;
+ double *restrict ai = abuf + m_iter * ps_a;
+ double *restrict bj = bbuf;
+ // covers the range 16 < m_left <= 24 by using masked load/store instructions
+ if( 16 < m_left )
+ {
+ bli_dgemmsup_rv_zen4_asm_24x1(
+ conja, conjb, m_left, nr_cur, k0,
+ alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+ beta, cij, rs_c0, cs_c0, data, cntx);
+ }
+ // covers the range 8 < m_left <= 16 by using masked load/store instructions
+ else if( 8 < m_left )
+ {
+ bli_dgemmsup_rv_zen4_asm_16x1(
+ conja, conjb, m_left, nr_cur, k0,
+ alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+ beta, cij, rs_c0, cs_c0, data, cntx);
+ }
+ // covers the range 0 < m_left <= 8 by using masked load/store instructions
+ else if( 0 < m_left )
+ {
+ bli_dgemmsup_rv_zen4_asm_8x1(
+ conja, conjb, m_left, nr_cur, k0,
+ alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+ beta, cij, rs_c0, cs_c0, data, cntx);
+ }
+ }
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/CMakeLists.txt b/kernels/zen4/3/sup/d24x8/CMakeLists.txt
new file mode 100644
index 000000000..f5a815987
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/CMakeLists.txt
@@ -0,0 +1,13 @@
+##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.##
+
+target_sources("${PROJECT_NAME}"
+ PRIVATE
+${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx1.c
+${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx2.c
+${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx3.c
+${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx4.c
+${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx5.c
+${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx6.c
+${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx7.c
+${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx8.c
+ )
\ No newline at end of file
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c
new file mode 100644
index 000000000..32b443777
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c
@@ -0,0 +1,1380 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/* These kernels assume that the A matrix is in col-major order.
+ * The B matrix can be col/row-major.
+ * The C matrix can be col/row-major, though support for row-major order
+ * will be added by a separate commit.
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x1
+(
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double precision elements can be loaded into a 512-bit register.
+ * So, we use an 8-bit mask to specify which elements are to be
+ * loaded/stored into/from the register. m_left % 8 specifies how many
+ * elements are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements + * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3) + * times to the right which makes the mask to be (00000111) + */ + uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left + // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored + // So, mask becomes 0xff(11111111) + if (mask == 0) mask = 0xff; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + mov(var(mask), rdx) // load mask + kmovw(edx, k(2)) // move mask to k2 register + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, 7*8), rdx) // C for prefetching + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm28, zmm28, zmm28) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 1+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx 
) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(1), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b 
+ vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( 
zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1) + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm28) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx)) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
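+
+ // The row-major (cs_c == 1) paths below are stubs for now; as noted
+ // at the top of this file, row-stored C support is planned for a
+ // separate commit, so they simply fall through to .DDONE.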
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x1
+(
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double precision elements can be loaded into a 512-bit register.
+ * So, we use an 8-bit mask to specify which elements are to be
+ * loaded/stored into/from the register. m_left % 8 specifies how many
+ * elements are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements + * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3) + * times to the right which makes the mask to be (00000111) + */ + uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left + // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored + // So, mask becomes 0xff(11111111) + if (mask == 0) mask = 0xff; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + mov(var(mask), rdx) // load mask + kmovw(edx, k(2)) // move mask to k2 register + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, 7*8), rdx) // C for prefetching + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 1+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax 
) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(1), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) 
// load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. 
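+
+ // Fewer than 8 k-iterations remain (k_left = k0 % 8); the loop below
+ // handles them one k at a time, using the same masked loads of A.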
+ + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0), + [mask] "m" (mask) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); +} + + +void bli_dgemmsup_rv_zen4_asm_8x1 +( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. 
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double precision elements can be loaded into a 512-bit register.
+ * So, we use an 8-bit mask to specify which elements are to be
+ * loaded/stored into/from the register. m_left % 8 specifies how many
+ * elements are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements
+ * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3)
+ * times to the right which makes the mask to be (00000111)
+ */
+ uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+ // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+ // So, mask becomes 0xff(11111111)
+ if (mask == 0) mask = 0xff;
+
+ // -------------------------------------------------------------------------
+ begin_asm()
+
+ mov(var(a), rax) // load address of a
+ mov(var(cs_a), r10) // load cs_a
+ mov(var(b), rbx) // load address of b
+ mov(var(rs_b), r8) // load rs_b
+ mov(var(cs_b), r9) // load cs_b
+ mov(var(c), rcx) // load address of c
+ mov(var(cs_c), rdi) // load cs_c
+ mov(var(mask), rdx) // load mask
+ kmovw(edx, k(2)) // move mask to k2 register
+ lea(mem(, r8, 8), r8) // rs_b *= sizeof(double)
+ lea(mem(, r9, 8), r9) // cs_b *= sizeof(double)
+ lea(mem(, r10, 8), r10) // cs_a *= sizeof(double)
+ lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
+ lea(mem(rcx, 7*8), rdx) // C for prefetching
+ lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching)
+
+ /* Register usage: zmm0-5 are used to load A matrix
+ * zmm6-29 are used for accumulation
+ * zmm30-31 are used for broadcasting B matrix
+ */
+
+ // zero out all accumulation registers
+ vxorpd(zmm6, zmm6, zmm6)
+
+ // K is unrolled by 8 to facilitate prefetch of B
+ // Assuming B to be col-stored, for each iteration of K,
+ //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+ label(.DLOOPKITER) // main loop
+ mov(var(k_iter), rsi) // i = k_iter
+ sub(imm( 1+TAIL_NITER), rsi) // i -= NR + TAIL_NITER
+ jle(.PREFETCHLOOP) // jump if i <= 0
+
+ label(.LOOP1)
+
+ // ---------------------------------- iteration 1
+
+ vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+
+ // ---------------------------------- iteration 2
+
+ vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vbroadcastsd( mem(rbx),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+
+ // ---------------------------------- iteration 3
+
+ vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vbroadcastsd( mem(rbx),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+
+ // ---------------------------------- iteration 4
+
+ vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint
+ add( r10,rax
) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(1), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + 
lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm6 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm6 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
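+
+ // End of the k loop for the masked single-register case: zmm6 holds
+ // the one accumulation column (up to 8 rows), unscaled by alpha.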
+
+
+ label(.DPOSTACCUM)
+ mov(var(alpha), rdx) // load address of alpha
+ vbroadcastsd(mem(rdx), zmm30) // broadcast alpha
+ mov(var(beta), rax) // load address of beta
+ vbroadcastsd(mem(rax), zmm31) // broadcast beta
+
+ // scale by alpha
+ vmulpd( zmm30,zmm6,zmm6 )
+
+
+ mov(var(rs_c), rsi) // load rs_c
+ lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
+ vxorpd(ymm2, ymm2, ymm2)
+ vucomisd(xmm2, xmm31) // set ZF if beta == 0
+ je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case
+
+
+ cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8
+
+
+ jz(.DROWSTORED) // jump to row storage case
+
+ label(.DCOLSTORED)
+ vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm0,zmm31,zmm6)
+ vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask
+
+ jmp(.DDONE) // jump to end.
+
+ label(.DROWSTORED)
+
+ // yet to be implemented
+ jmp(.DDONE) // jump to end.
+
+
+ label(.DBETAZERO)
+ cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8
+
+ jz(.DROWSTORBZ) // jump to row storage case
+ label(.DCOLSTORBZ)
+ vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask
+
+ jmp(.DDONE) // jump to end.
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c
new file mode 100644
index 000000000..898035c4f
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c
@@ -0,0 +1,1650 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/* These kernels assume that the A matrix is in col-major order.
+ * The B matrix can be col- or row-major.
+ * The C matrix can be col- or row-major, though the row-stored case is
+ * yet to be implemented and will be added by a separate commit.
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x2
+(
+    conj_t              conja,
+    conj_t              conjb,
+    dim_t               m0,
+    dim_t               n0,
+    dim_t               k0,
+    double*    restrict alpha,
+    double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+    double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+    double*    restrict beta,
+    double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+    auxinfo_t* restrict data,
+    cntx_t*    restrict cntx
+)
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a = rs_a0;
+    uint64_t cs_a = cs_a0;
+    uint64_t rs_b = rs_b0;
+    uint64_t cs_b = cs_b0;
+    uint64_t rs_c = rs_c0;
+    uint64_t cs_c = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8 = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register,
+     * so we use an 8-bit mask to specify which elements are to be
+     * loaded/stored into/from the register. m_left % 8 specifies how many
+     * elements are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements + * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3) + * times to the right which makes the mask to be (00000111) + */ + uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left + // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored + // So, mask becomes 0xff(11111111) + if (mask == 0) mask = 0xff; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + mov(var(mask), rdx) // load mask + kmovw(edx, k(2)) // move mask to k2 register + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, 7*8), rdx) // C for prefetching + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm28, zmm28, zmm28) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm29, zmm29, zmm29) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 2+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + 
vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(2), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
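+
+    // LOOP2 below performs the same 8x-unrolled update as LOOP1 but also
+    // issues write-prefetches for C: the three cachelines holding one
+    // 24-element column of C (rdx, rdx+64, rdx+128) are touched, and rdx
+    // advances by cs_c at the end of the loop body, so both columns of the
+    // 24x2 tile are covered during the NR (= 2) prefetch iterations.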
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( 
zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( 
zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
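+
+    // Accumulation is complete: zmm6/zmm7/zmm28 hold rows 0-7/8-15/16-23
+    // of column 0 of the 24x2 C tile, and zmm8/zmm9/zmm29 hold the same
+    // rows of column 1. The post-accumulation code below scales them by
+    // alpha and, unless beta == 0, folds in beta*C on the col-stored path.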
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm29,zmm29 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1) + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm28) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( 0x40(rcx,rdi,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm29) + vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx)) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0), + [mask] "m" (mask) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); +} + + +void bli_dgemmsup_rv_zen4_asm_16x2 +( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. 
+ uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + uint64_t k_iter = (uint64_t)k0 / 8; + uint64_t k_left = (uint64_t)k0 % 8; + + /* 8 double precision elements can be loaded into a 512-bit register + * So, we use an 8-bit mask to specify which elements to be loaded/stored + * into/from the register. m_left % 8 specifies how many number of elements + * are to be loaded/stored into/from the last register. + * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements + * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3) + * times to the right which makes the mask to be (00000111) + */ + uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left + // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored + // So, mask becomes 0xff(11111111) + if (mask == 0) mask = 0xff; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + mov(var(mask), rdx) // load mask + kmovw(edx, k(2)) // move mask to k2 register + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, 7*8), rdx) // C for prefetching + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 2+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( 
zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(2), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
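+
+    // As in the 24x2 kernel, LOOP2 adds write-prefetches of the C tile:
+    // here a 16-element column of C spans only two cachelines, so just
+    // mem(rdx) and mem(rdx, 64) are touched before rdx steps to the next
+    // column by cs_c.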
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + 
vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 
) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm4,zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
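+
+    // Note that the beta == 0 path above stores the alpha-scaled result
+    // directly, without first reading C, so a C buffer that still holds
+    // uninitialized data is never loaded.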
+ + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0), + [mask] "m" (mask) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); +} + + +void bli_dgemmsup_rv_zen4_asm_8x2 +( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + uint64_t k_iter = (uint64_t)k0 / 8; + uint64_t k_left = (uint64_t)k0 % 8; + + /* 8 double precision elements can be loaded into a 512-bit register + * So, we use an 8-bit mask to specify which elements to be loaded/stored + * into/from the register. m_left % 8 specifies how many number of elements + * are to be loaded/stored into/from the last register. 
+ * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements + * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3) + * times to the right which makes the mask to be (00000111) + */ + uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left + // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored + // So, mask becomes 0xff(11111111) + if (mask == 0) mask = 0xff; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + mov(var(mask), rdx) // load mask + kmovw(edx, k(2)) // move mask to k2 register + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, 7*8), rdx) // C for prefetching + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm8, zmm8, zmm8) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 2+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask 
and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(2), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( 
zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + + // 
---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm8 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm8 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm8,zmm8 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
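+
+    // In this m0 <= 8 edge kernel each 8-element column of the C tile fits
+    // in a single zmm register, so the col-stored paths above load and
+    // store C entirely through mask k2.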
+
+
+    label(.DROWSTORBZ)
+
+    // yet to be implemented
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a] "m" (a),
+      [rs_a] "m" (rs_a),
+      [cs_a] "m" (cs_a),
+      [ps_a8] "m" (ps_a8),
+      [b] "m" (b),
+      [rs_b] "m" (rs_b),
+      [cs_b] "m" (cs_b),
+      [alpha] "m" (alpha),
+      [beta] "m" (beta),
+      [c] "m" (c),
+      [rs_c] "m" (rs_c),
+      [cs_c] "m" (cs_c),
+      [n0] "m" (n0),
+      [m0] "m" (m0),
+      [mask] "m" (mask)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c
new file mode 100644
index 000000000..4f5466f84
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c
@@ -0,0 +1,1920 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/* These kernels assume that the A matrix is in col-major order.
+ * The B matrix can be col- or row-major.
+ * The C matrix can be col- or row-major, though the row-stored case is
+ * yet to be implemented and will be added by a separate commit.
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of A matrix is not done in edge-case kernels. + */ + +void bli_dgemmsup_rv_zen4_asm_24x3 +( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + uint64_t k_iter = (uint64_t)k0 / 8; + uint64_t k_left = (uint64_t)k0 % 8; + + /* 8 double precision elements can be loaded into a 512-bit register + * So, we use an 8-bit mask to specify which elements to be loaded/stored + * into/from the register. m_left % 8 specifies how many number of elements + * are to be loaded/stored into/from the last register. + * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements + * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3) + * times to the right which makes the mask to be (00000111) + */ + uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left + // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored + // So, mask becomes 0xff(11111111) + if (mask == 0) mask = 0xff; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + mov(var(mask), rdx) // load mask + kmovw(edx, k(2)) // move mask to k2 register + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, 7*8), rdx) // C for prefetching + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm28, zmm28, zmm28) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm29, zmm29, zmm29) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) + vxorpd(zmm26, zmm26, zmm26) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 3+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + 
vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A 
with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(3), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
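+    // Main-loop structure: LOOP1 above covers the first
+    // k_iter - (3 + TAIL_NITER) unrolled passes without touching C.
+    // LOOP2 below runs 3 (= NR) passes, each prefetching one column of
+    // the 24x3 C tile (three cachelines, at rdx, rdx+64 and rdx+128)
+    // before advancing rdx by cs_c. LOOP3 then runs the final
+    // TAIL_NITER passes so the C prefetches can complete before C is
+    // accessed in the post-accumulation code.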
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( 
mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. 
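+    // Worked example, assuming k0 = 64: k_iter = 8 and k_left = 0, so
+    // LOOP1 runs 8 - (3 + TAIL_NITER) = 2 passes, LOOP2 runs 3, and
+    // LOOP3 below runs TAIL_NITER = 3, i.e. 8 unrolled passes in total.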
+ label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + 
vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
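+    // Accumulator layout: (zmm6, zmm7, zmm28) hold rows 0..23 of C
+    // column 0, (zmm8, zmm9, zmm29) column 1 and (zmm10, zmm11, zmm26)
+    // column 2; the third register of each triplet is the masked tail
+    // of up to 8 rows. DPOSTACCUM below scales them by alpha and, when
+    // beta != 0, merges in beta*C.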
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm29,zmm29 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm26,zmm26 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1) + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm28) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( 0x40(rcx,rdi,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm29) + vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0) + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( 0x40(rcx,rdi,2),zmm1) + vfmadd231pd( zmm1,zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm26) + vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx)) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
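+    // Note on the .DBETAZERO paths: when beta == 0, C is written
+    // without first being read, which skips the loads and keeps any
+    // uninitialized (possibly NaN/Inf) contents of C from leaking into
+    // the result, as BLAS semantics require.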
+ + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0), + [mask] "m" (mask) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); +} + + +void bli_dgemmsup_rv_zen4_asm_16x3 +( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + uint64_t k_iter = (uint64_t)k0 / 8; + uint64_t k_left = (uint64_t)k0 % 8; + + /* 8 double precision elements can be loaded into a 512-bit register + * So, we use an 8-bit mask to specify which elements to be loaded/stored + * into/from the register. m_left % 8 specifies how many number of elements + * are to be loaded/stored into/from the last register. 
+ * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements + * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3) + * times to the right which makes the mask to be (00000111) + */ + uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left + // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored + // So, mask becomes 0xff(11111111) + if (mask == 0) mask = 0xff; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + mov(var(mask), rdx) // load mask + kmovw(edx, k(2)) // move mask to k2 register + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, 7*8), rdx) // C for prefetching + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 3+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 
) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(3), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
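+    // As in the 24x3 kernel, each LOOP2 pass prefetches one column of
+    // the C tile; here a column is 16 doubles, i.e. two cachelines
+    // (rdx and rdx+64).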
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += 
rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( 
mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
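+    // Accumulator layout: (zmm6, zmm7) hold rows 0..15 of C column 0,
+    // (zmm8, zmm9) column 1 and (zmm10, zmm11) column 2; the second
+    // register of each pair is the masked tail of up to 8 rows.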
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm4,zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0) + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
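+    // Scalar reference for the column-stored updates above (an
+    // illustrative sketch only, with ab(i,j) denoting the accumulated
+    // dot products held in the zmm registers; rs_c == 1 on this path):
+    //
+    //   for ( dim_t j = 0; j < 3; ++j )
+    //     for ( dim_t i = 0; i < m0; ++i )
+    //       c[ i + j*cs_c ] = ( *beta == 0.0 )
+    //                         ? (*alpha) * ab( i, j )
+    //                         : (*alpha) * ab( i, j ) + (*beta) * c[ i + j*cs_c ];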
+ + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0), + [mask] "m" (mask) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); +} + + +void bli_dgemmsup_rv_zen4_asm_8x3 +( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + uint64_t k_iter = (uint64_t)k0 / 8; + uint64_t k_left = (uint64_t)k0 % 8; + + /* 8 double precision elements can be loaded into a 512-bit register + * So, we use an 8-bit mask to specify which elements to be loaded/stored + * into/from the register. m_left % 8 specifies how many number of elements + * are to be loaded/stored into/from the last register. 
+ * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements + * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3) + * times to the right which makes the mask to be (00000111) + */ + uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left + // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored + // So, mask becomes 0xff(11111111) + if (mask == 0) mask = 0xff; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + mov(var(mask), rdx) // load mask + kmovw(edx, k(2)) // move mask to k2 register + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, 7*8), rdx) // C for prefetching + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm10, zmm10, zmm10) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 3+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + 
vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(3), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
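+    // For the 8x3 tile a column of C fits in one (masked) cacheline,
+    // so each LOOP2 pass issues a single prefetchw0 for the current
+    // column before advancing rdx by cs_c.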
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + 
lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( 
mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm10 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm30,zmm10 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm10,zmm10 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
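As a scalar reference for the masked .DCOLSTORED / .DCOLSTORBZ paths above (the helper name and array layout here are illustrative, not part of the patch): the k2 mask restricts every C load/store to the m_left valid rows, and the beta == 0 branch skips the C load entirely.

    // Illustrative C model of the masked, column-stored C update above.
    // acc[j] stands for the zmm6/zmm8/zmm10 accumulators, which were
    // already scaled by alpha in .DPOSTACCUM.
    static void masked_col_update( dim_t m_left, double beta,
                                   const double acc[ 3 ][ 8 ],
                                   double* c, inc_t cs_c )
    {
        for ( dim_t j = 0; j < 3; ++j )           // three columns of C
            for ( dim_t i = 0; i < m_left; ++i )  // rows enabled by mask k2
                c[ i + j*cs_c ] = ( beta != 0.0 )
                                  ? beta * c[ i + j*cs_c ] + acc[ j ][ i ]
                                  : acc[ j ][ i ]; // beta == 0: C not read
    }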
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c
new file mode 100644
index 000000000..fb067e685
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c
@@ -0,0 +1,2196 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/* These kernels assume that the A matrix needs to be in col-major order.
+ * The B matrix can be col/row-major.
+ * The C matrix can be col/row-major, though support for row-major order
+ * will be added by a separate commit.
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of A matrix is not done in edge-case kernels.
+ */
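The mask computation below recurs verbatim in each kernel of this file. As a worked example, here is the same arithmetic as a stand-alone helper (the function name is hypothetical):

    // mask enables the m0 % 8 leftover rows of the last, partial vector.
    static uint8_t dgemmsup_edge_mask( dim_t m0 )
    {
        uint8_t mask = 0xff >> ( 0x8 - ( m0 & 7 ) );
        if ( mask == 0 ) mask = 0xff; // m0 % 8 == 0 (e.g. 24/16/8): full vector
        return mask;
    }
    // m0 = 19: m0 & 7 = 3, 0xff >> (8-3) = 0b00000111 -> 3 rows enabled
    // m0 = 16: m0 & 7 = 0, 0xff >> 8    = 0x00, fixed up to 0xff (all rows)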
+
+void bli_dgemmsup_rv_zen4_asm_24x4
+(
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double-precision elements can be loaded into a 512-bit register.
+ * So, we use an 8-bit mask to specify which elements are to be loaded/stored
+ * into/from the register. m_left % 8 specifies how many elements
+ * are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that 3 elements
+ * have to be loaded/stored into/from the register, so 0xff(11111111) is shifted
+ * right by (8-3) bits, which makes the mask (00000111).
+ */
+ uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+ // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+ // So, mask becomes 0xff(11111111)
+ if (mask == 0) mask = 0xff;
+
+ // -------------------------------------------------------------------------
+ begin_asm()
+
+ mov(var(a), rax) // load address of a
+ mov(var(cs_a), r10) // load cs_a
+ mov(var(b), rbx) // load address of b
+ mov(var(rs_b), r8) // load rs_b
+ mov(var(cs_b), r9) // load cs_b
+ mov(var(c), rcx) // load address of c
+ mov(var(cs_c), rdi) // load cs_c
+ mov(var(mask), rdx) // load mask
+ kmovw(edx, k(2)) // move mask to k2 register
+ lea(mem(, r8, 8), r8) // rs_b *= sizeof(double)
+ lea(mem(, r9, 8), r9) // cs_b *= sizeof(double)
+ lea(mem(, r10, 8), r10) // cs_a *= sizeof(double)
+ lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
+ lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b
+ lea(mem(rcx, 7*8), rdx) // C for prefetching
+ lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching)
+
+ /* Register usage: zmm0-5 are used to load A matrix
+ * zmm6-29 are used for accumulation
+ * zmm30-31 are used for broadcasting B matrix
+ */
+
+ // zero out all accumulation registers
+ vxorpd(zmm6, zmm6, zmm6)
+ vxorpd(zmm7, zmm7, zmm7)
+ vxorpd(zmm28, zmm28, zmm28)
+ vxorpd(zmm8, zmm8, zmm8)
+ vxorpd(zmm9, zmm9, zmm9)
+ vxorpd(zmm29, zmm29, zmm29)
+ vxorpd(zmm10, zmm10, zmm10)
+ vxorpd(zmm11, zmm11, zmm11)
+ vxorpd(zmm26, zmm26, zmm26)
+ vxorpd(zmm12, zmm12, zmm12)
+ vxorpd(zmm13, zmm13, zmm13)
+ vxorpd(zmm27,zmm27, zmm27)
+
+ // K is unrolled by 8 to facilitate prefetch of B.
+ // Assuming B to be col-stored, for each iteration of K,
+ // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b.
+ label(.DLOOPKITER) // main loop
+ mov(var(k_iter), rsi) // i = k_iter
+ sub(imm( 4+TAIL_NITER), rsi) // i -= NR + TAIL_NITER
+ jle(.PREFETCHLOOP) // jump if i <= 0
+
+ label(.LOOP1)
+
+ // ---------------------------------- iteration 1
+
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 )
+ vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) //
Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 
0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(4), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
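The three-phase structure around .LOOP1 above and .LOOP2/.LOOP3 below rate-limits C prefetching and finishes it a few iterations before the k loop ends. A control-flow sketch, assuming k_iter > NR + TAIL_NITER (NR = 4 in this kernel; each iteration is one 8x-unrolled k block):

    int64_t i = (int64_t)k_iter - ( 4 + TAIL_NITER );
    while ( i-- > 0 )                      { /* .LOOP1: FMAs, prefetch B only      */ }
    for ( int j = 0; j < 4; ++j )          { /* .LOOP2: FMAs + prefetchw one C col */ }
    for ( int j = 0; j < TAIL_NITER; ++j ) { /* .LOOP3: FMAs, C prefetch complete  */ }
    // .TAIL then handles the k0 % 8 leftover updates one k step at a time.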
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + 
vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. 
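The prefetchw0 instructions in .LOOP2 above walk C one column per iteration, three cache lines per column for the 24-row tile. A rough intrinsics sketch of the same access pattern (this uses GCC's _MM_HINT_ET0 write hint from <xmmintrin.h>; the offset of 7 doubles mirrors the rdx setup above):

    for ( int j = 0; j < 4; ++j )              // one C column per .LOOP2 pass
        for ( int line = 0; line < 3; ++line ) // 24 doubles = 3 cache lines
            _mm_prefetch( (const char*)( c + 7 + j*cs_c ) + 64*line,
                          _MM_HINT_ET0 );      // prefetch for write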
+ label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + 
vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. 
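A scalar model of the .TAIL loop that follows: each of the k_left = k0 % 8 remaining steps is one rank-1 update of the 24x4 accumulator tile (acc is an illustrative name; in the kernel the last 8 rows are mask-predicated when m0 < 24):

    for ( uint64_t kk = 0; kk < k_left; ++kk )   // .DLOOPKLEFT
        for ( int j = 0; j < 4; ++j )            // four broadcasts from B
            for ( int i = 0; i < 24; ++i )       // three zmm loads from A
                acc[ i ][ j ] += a[ i + kk*cs_a ] * b[ kk*rs_b + j*cs_b ];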
+ + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm29,zmm29 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm26,zmm26 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm13,zmm13 ) + vmulpd( zmm30,zmm27,zmm27 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1) + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm28) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( 0x40(rcx,rdi,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm29) + vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0) + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( 0x40(rcx,rdi,2),zmm1) + vfmadd231pd( zmm1,zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm26) + vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,r13,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( 0x40(rcx,r13,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm13) + vmovupd( zmm13,0x40(rcx,r13,1)) + vmovupd( 0x80(rcx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm27) + vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
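For readability, the accumulator-to-C mapping used by the .DCOLSTORED block above, read off directly from the stores (only the last 8-row chunk of each column is predicated by mask k2):

    // C column 0: zmm6  (rows 0-7), zmm7  (rows 8-15), zmm28 (rows 16-23, k2)
    // C column 1: zmm8  (rows 0-7), zmm9  (rows 8-15), zmm29 (rows 16-23, k2)
    // C column 2: zmm10 (rows 0-7), zmm11 (rows 8-15), zmm26 (rows 16-23, k2)
    // C column 3: zmm12 (rows 0-7), zmm13 (rows 8-15), zmm27 (rows 16-23, k2)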
+
+ label(.DROWSTORED)
+
+ // yet to be implemented
+ jmp(.DDONE) // jump to end.
+
+
+ label(.DBETAZERO)
+ cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8
+
+ jz(.DROWSTORBZ) // jump to row storage case
+ label(.DCOLSTORBZ)
+ vmovupd( zmm6,(rcx))
+ vmovupd( zmm7,0x40(rcx))
+ vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm8,(rcx,rdi,1))
+ vmovupd( zmm9,0x40(rcx,rdi,1))
+ vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm10,(rcx,rdi,2))
+ vmovupd( zmm11,0x40(rcx,rdi,2))
+ vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm12,(rcx,r13,1))
+ vmovupd( zmm13,0x40(rcx,r13,1))
+ vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask
+
+ jmp(.DDONE) // jump to end.
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x4
+(
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double-precision elements can be loaded into a 512-bit register.
+ * So, we use an 8-bit mask to specify which elements are to be loaded/stored
+ * into/from the register. m_left % 8 specifies how many elements
+ * are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements + * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3) + * times to the right which makes the mask to be (00000111) + */ + uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left + // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored + // So, mask becomes 0xff(11111111) + if (mask == 0) mask = 0xff; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + mov(var(mask), rdx) // load mask + kmovw(edx, k(2)) // move mask to k2 register + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b + lea(mem(rcx, 7*8), rdx) // C for prefetching + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) + vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 4+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( 
zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 
) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(4), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( 
zmm4,zmm31,zmm13 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. 
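One difference from the 24x4 kernel worth noting in .LOOP2 above:

    // A 16-double column of C spans 128 bytes = 2 cache lines, so this
    // kernel issues only prefetchw0(mem(rdx)) and prefetchw0(mem(rdx, 64))
    // per column, versus three lines (rdx, rdx+64, rdx+128) in the 24x4 case.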
+ label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + 
vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
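The .DPOSTACCUM block that follows implements the usual C := beta*C + alpha*(A*B) epilogue, one 8-wide vector at a time. A minimal scalar sketch (acc and cvec are illustrative names for one accumulator register and the matching C vector; tail lanes are masked):

    for ( int i = 0; i < 8; ++i )
    {
        acc[ i ]  *= alpha;                        // vmulpd by broadcast alpha
        cvec[ i ]  = beta * cvec[ i ] + acc[ i ];  // vfmadd231pd by broadcast beta
    }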
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm13,zmm13 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm4,zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0) + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,r13,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( 0x40(rcx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm4,zmm31,zmm13) + vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
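A short note on the .DBETAZERO path above:

    // When beta == 0, the kernel stores alpha*(A*B) without ever loading C,
    // so uninitialized or NaN-filled C memory is not propagated, matching
    // the usual BLAS convention for the beta == 0 case.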
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_8x4
+(
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double precision elements can be loaded into a 512-bit register,
+ * so we use an 8-bit mask to specify which elements are to be loaded/stored
+ * into/from the register. m_left % 8 specifies how many elements
+ * are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that
+ * 3 elements have to be loaded/stored into/from the register, so
+ * 0xff(11111111) is shifted right by (8-3) places, which makes the
+ * mask (00000111).
+ */
+ uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+ // For the special cases where m_left = 24/16/8, all 8 elements have to be
+ // loaded/stored, so the mask becomes 0xff(11111111).
+ if (mask == 0) mask = 0xff;
+
+ // -------------------------------------------------------------------------
+ begin_asm()
+
+ mov(var(a), rax) // load address of a
+ mov(var(cs_a), r10) // load cs_a
+ mov(var(b), rbx) // load address of b
+ mov(var(rs_b), r8) // load rs_b
+ mov(var(cs_b), r9) // load cs_b
+ mov(var(c), rcx) // load address of c
+ mov(var(cs_c), rdi) // load cs_c
+ mov(var(mask), rdx) // load mask
+ kmovw(edx, k(2)) // move mask to k2 register
+ lea(mem(, r8, 8), r8) // rs_b *= sizeof(double)
+ lea(mem(, r9, 8), r9) // cs_b *= sizeof(double)
+ lea(mem(, r10, 8), r10) // cs_a *= sizeof(double)
+ lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
+ lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b
+ lea(mem(rcx, 7*8), rdx) // C for prefetching
+ lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching)
+
+ /* Register usage: zmm0 and zmm3 are used to load the A matrix,
+ * zmm6, zmm8, zmm10 and zmm12 are used for accumulation and
+ * zmm30-31 are used for broadcasting the B matrix.
+ */
+
+ // zero out all accumulation registers
+ vxorpd(zmm6, zmm6, zmm6)
+ vxorpd(zmm8, zmm8, zmm8)
+ vxorpd(zmm10, zmm10, zmm10)
+ vxorpd(zmm12, zmm12, zmm12)
+
+ // K is unrolled by 8 to facilitate prefetch of B.
+ // Assuming B to be col-stored, for each iteration of K,
+ // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b.
+ label(.DLOOPKITER) // main loop
+ mov(var(k_iter), rsi) // i = k_iter
+ sub(imm( 4+TAIL_NITER), rsi) // i -= NR + TAIL_NITER
+ jle(.PREFETCHLOOP) // jump if i <= 0
+
+ label(.LOOP1)
+
+ // ---------------------------------- iteration 1
+
+ vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+
+ // ---------------------------------- iteration 2
+
+ vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r9,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+
+ // ---------------------------------- iteration 3
+
+ vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r9,2) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vbroadcastsd(
mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(4), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
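+ // LOOP2 below repeats the same unrolled-by-8 body as LOOP1 but also
+ // prefetches one cacheline of C (prefetchw0) per iteration. The k_iter
+ // iterations are split as LOOP1 (k_iter - NR - TAIL_NITER), LOOP2 (NR,
+ // with C prefetch) and LOOP3 (TAIL_NITER) so that the C lines arrive
+ // shortly before post-accumulation. Illustrative split, assuming a
+ // hypothetical k0 = 96 and TAIL_NITER = 3: k_iter = 12 and NR = 4, so
+ // LOOP1 runs 5 times, LOOP2 4 times and LOOP3 3 times.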
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + 
vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( 
zmm0,zmm31,zmm12 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
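+ // Sketch of the k partitioning for a hypothetical k0 = 21:
+ // k_iter = 21/8 = 2 unrolled-by-8 iterations, and k_left = 21%8 = 5
+ // leftover rank-1 updates, which .DLOOPKLEFT above performs one k step
+ // at a time.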
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm12,zmm12 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,r13,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + + label(.DROWSTORBZ) + + // yet to be implemented + label(.DDONE) + + + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [n0] "m" (n0), + [m0] "m" (m0), + [mask] "m" (mask) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", + "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", + "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "memory" + ) + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); +} diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c new file mode 100644 index 000000000..991fe53be --- /dev/null +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c @@ -0,0 +1,2571 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/* These kernels assume that the A matrix is stored in col-major order,
+ * while the B matrix can be col/row-major.
+ * The C matrix can be col/row-major, though support for row-major order
+ * will be added by a separate commit.
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of the A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x5
+(
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double precision elements can be loaded into a 512-bit register,
+ * so we use an 8-bit mask to specify which elements are to be loaded/stored
+ * into/from the register. m_left % 8 specifies how many elements
+ * are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that
+ * 3 elements have to be loaded/stored into/from the register, so
+ * 0xff(11111111) is shifted right by (8-3) places, which makes the
+ * mask (00000111).
+ */
+ uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+ // For the special cases where m_left = 24/16/8, all 8 elements have to be
+ // loaded/stored, so the mask becomes 0xff(11111111).
+ if (mask == 0) mask = 0xff;
+
+ // -------------------------------------------------------------------------
+ begin_asm()
+
+ mov(var(a), rax) // load address of a
+ mov(var(cs_a), r10) // load cs_a
+ mov(var(b), rbx) // load address of b
+ mov(var(rs_b), r8) // load rs_b
+ mov(var(cs_b), r9) // load cs_b
+ mov(var(c), rcx) // load address of c
+ mov(var(cs_c), rdi) // load cs_c
+ mov(var(mask), rdx) // load mask
+ kmovw(edx, k(2)) // move mask to k2 register
+ lea(mem(, r8, 8), r8) // rs_b *= sizeof(double)
+ lea(mem(, r9, 8), r9) // cs_b *= sizeof(double)
+ lea(mem(, r10, 8), r10) // cs_a *= sizeof(double)
+ lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
+ lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b
+ // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+ // is also used to traverse the B matrix
+ lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b
+ lea(mem(rcx, 7*8), rdx) // C for prefetching
+ lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching)
+ // if n > 4, a second pointer, which points to r11 + 4*cs_b,
+ // is also used to prefetch from the B matrix
+ lea(mem(r11, r9, 4), r15) // r15 = r11 + 4*cs_b(B for prefetching)
+
+ /* Register usage: zmm0-5 are used to load A matrix
+ * zmm6-29 are used for accumulation
+ * zmm30-31 are used for broadcasting B matrix
+ */
+
+ // zero out all accumulation registers
+ vxorpd(zmm6, zmm6, zmm6)
+ vxorpd(zmm7, zmm7, zmm7)
+ vxorpd(zmm28, zmm28, zmm28)
+ vxorpd(zmm8, zmm8, zmm8)
+ vxorpd(zmm9, zmm9, zmm9)
+ vxorpd(zmm29, zmm29, zmm29)
+ vxorpd(zmm10, zmm10, zmm10)
+ vxorpd(zmm11, zmm11, zmm11)
+ vxorpd(zmm26, zmm26, zmm26)
+ vxorpd(zmm12, zmm12, zmm12)
+ vxorpd(zmm13, zmm13, zmm13)
+ vxorpd(zmm27, zmm27, zmm27)
+ vxorpd(zmm14, zmm14, zmm14)
+ vxorpd(zmm15, zmm15, zmm15)
+ vxorpd(zmm24, zmm24, zmm24)
+
+ // K is unrolled by 8 to facilitate prefetch of B.
+ // Assuming B to be col-stored, for each iteration of K,
+ // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b.
+ label(.DLOOPKITER) // main loop
+ mov(var(k_iter), rsi) // i = k_iter
+ sub(imm( 5+TAIL_NITER), rsi) // i -= NR + TAIL_NITER
+ jle(.PREFETCHLOOP) // jump if i <= 0
+
+ label(.LOOP1)
+
+ // ---------------------------------- iteration 1
+
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 )
+ vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 )
+ vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vfmadd231pd( zmm2,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vfmadd231pd( zmm2,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vfmadd231pd( zmm2,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b +=
rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // 
Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second 
pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(5), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) 
// b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // 
a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. 
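+ // x86 addressing can only scale an index register by 1, 2, 4 or 8, so
+ // columns 0..3 of B are reached from rbx (offsets 0, r9, 2*r9 and
+ // r13 = 3*r9) while column 4 is reached through the second pointer
+ // r12 = rbx + 4*cs_b; both advance by rs_b every k step. r15 plays the
+ // same role as r12 for the B prefetch stream.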
+ label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 
MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( 
zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
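+ // Accumulator layout for the 24x5 tile (three 8-double row panels per
+ // column of C; the third register of each column holds the masked
+ // m_left fringe):
+ //   col 0: zmm6,  zmm7,  zmm28    col 1: zmm8,  zmm9,  zmm29
+ //   col 2: zmm10, zmm11, zmm26    col 3: zmm12, zmm13, zmm27
+ //   col 4: zmm14, zmm15, zmm24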
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm29,zmm29 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm26,zmm26 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm13,zmm13 ) + vmulpd( zmm30,zmm27,zmm27 ) + vmulpd( zmm30,zmm14,zmm14 ) + vmulpd( zmm30,zmm15,zmm15 ) + vmulpd( zmm30,zmm24,zmm24 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1) + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm28) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( 0x40(rcx,rdi,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm29) + vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0) + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( 0x40(rcx,rdi,2),zmm1) + vfmadd231pd( zmm1,zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm26) + vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,r13,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( 0x40(rcx,r13,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm13) + vmovupd( zmm13,0x40(rcx,r13,1)) + vmovupd( 0x80(rcx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm27) + vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm14) + vmovupd( zmm14,(rdx)) + vmovupd( 0x40(rdx),zmm1) + vfmadd231pd( zmm1,zmm31,zmm15) + vmovupd( zmm15,0x40(rdx)) + vmovupd( 0x80(rdx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm24) + vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. 
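+ // The masked updates in .DCOLSTORED above compute, per lane i selected
+ // by k2 (illustrative scalar form):
+ //   c[i] = beta*c[i] + acc[i]
+ // via a masked load of C with zeroed inactive lanes, a vfmadd231pd
+ // against the broadcast beta in zmm31, and a masked store, so lanes
+ // beyond m_left in the fringe panel are never written.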
+
+
+ label(.DBETAZERO)
+ cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8
+
+ jz(.DROWSTORBZ) // jump to row storage case
+ label(.DCOLSTORBZ)
+ vmovupd( zmm6,(rcx))
+ vmovupd( zmm7,0x40(rcx))
+ vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm8,(rcx,rdi,1))
+ vmovupd( zmm9,0x40(rcx,rdi,1))
+ vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm10,(rcx,rdi,2))
+ vmovupd( zmm11,0x40(rcx,rdi,2))
+ vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm12,(rcx,r13,1))
+ vmovupd( zmm13,0x40(rcx,r13,1))
+ vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm14,(rdx))
+ vmovupd( zmm15,0x40(rdx))
+ vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask
+
+ jmp(.DDONE) // jump to end.
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x5
+(
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double precision elements can be loaded into a 512-bit register,
+ * so we use an 8-bit mask to specify which elements are to be loaded/stored
+ * into/from the register. m_left % 8 specifies how many elements
+ * are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that
+ * 3 elements have to be loaded/stored into/from the register, so
+ * 0xff(11111111) is shifted right by (8-3) places, which makes the
+ * mask (00000111).
+ */
+ uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+ // For the special cases where m_left = 24/16/8, all 8 elements have to be
+ // loaded/stored, so the mask becomes 0xff(11111111).
+ if (mask == 0) mask = 0xff;
+
+ // -------------------------------------------------------------------------
+ begin_asm()
+
+ mov(var(a), rax) // load address of a
+ mov(var(cs_a), r10) // load cs_a
+ mov(var(b), rbx) // load address of b
+ mov(var(rs_b), r8) // load rs_b
+ mov(var(cs_b), r9) // load cs_b
+ mov(var(c), rcx) // load address of c
+ mov(var(cs_c), rdi) // load cs_c
+ mov(var(mask), rdx) // load mask
+ kmovw(edx, k(2)) // move mask to k2 register
+ lea(mem(, r8, 8), r8) // rs_b *= sizeof(double)
+ lea(mem(, r9, 8), r9) // cs_b *= sizeof(double)
+ lea(mem(, r10, 8), r10) // cs_a *= sizeof(double)
+ lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
+ lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b
+ // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+ // is also used to traverse the B matrix
+ lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b
+ lea(mem(rcx, 7*8), rdx) // C for prefetching
+ lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching)
+ // if n > 4, a second pointer, which points to r11 + 4*cs_b,
+ // is also used to prefetch from the B matrix
+ lea(mem(r11, r9, 4), r15) // r15 = r11 + 4*cs_b(B for prefetching)
+
+ /* Register usage: zmm0-4 are used to load the A matrix,
+ * zmm6-15 are used for accumulation and
+ * zmm30-31 are used for broadcasting the B matrix.
+ */
+
+ // zero out all accumulation registers
+ vxorpd(zmm6, zmm6, zmm6)
+ vxorpd(zmm7, zmm7, zmm7)
+ vxorpd(zmm8, zmm8, zmm8)
+ vxorpd(zmm9, zmm9, zmm9)
+ vxorpd(zmm10, zmm10, zmm10)
+ vxorpd(zmm11, zmm11, zmm11)
+ vxorpd(zmm12, zmm12, zmm12)
+ vxorpd(zmm13, zmm13, zmm13)
+ vxorpd(zmm14, zmm14, zmm14)
+ vxorpd(zmm15, zmm15, zmm15)
+
+ // K is unrolled by 8 to facilitate prefetch of B.
+ // Assuming B to be col-stored, for each iteration of K,
+ // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b.
+ label(.DLOOPKITER) // main loop
+ mov(var(k_iter), rsi) // i = k_iter
+ sub(imm( 5+TAIL_NITER), rsi) // i -= NR + TAIL_NITER
+ jle(.PREFETCHLOOP) // jump if i <= 0
+
+ label(.LOOP1)
+
+ // ---------------------------------- iteration 1
+
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+
+ // ---------------------------------- iteration 2
+
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1
MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( 
zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(5), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
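+    // LOOP2 runs for the next NR (= 5) iterations of K; besides the same
+    // rank-1 updates, each iteration prefetches one column of the C tile
+    // for writing (prefetchw0 at rdx and rdx+64) and advances rdx by cs_c.
+    // For example, with k_iter = 20 and TAIL_NITER = 3, LOOP1 runs 12
+    // times, LOOP2 5 times and LOOP3 the remaining 3 times.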
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( 
zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. 
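+    // The last TAIL_NITER iterations of K are peeled into LOOP3, which
+    // presumably keeps some distance between the C prefetches issued in
+    // LOOP2 and the actual loads/stores of C after .DPOSTACCUM.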
+ label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += 
rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. 
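+    // .TAIL handles the k0 % 8 leftover iterations of K one rank-1 update
+    // at a time, e.g. k0 = 13 gives k_iter = 1 (8 updates in the unrolled
+    // loops) and k_left = 5 (5 updates below).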
+ + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm13,zmm13 ) + vmulpd( zmm30,zmm14,zmm14 ) + vmulpd( zmm30,zmm15,zmm15 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm4,zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0) + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,r13,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( 0x40(rcx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm4,zmm31,zmm13) + vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm14) + vmovupd( zmm14,(rdx)) + vmovupd( 0x40(rdx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm15) + vmovupd( zmm15,0x40(rdx) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. 
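+    // Beta == 0 case for the 16x5 block: rows 0-7 of each column are
+    // stored with a full vmovupd and rows 8..m0-1 through the k2 mask,
+    // i.e. C(0:m0-1,j) = alpha*AB(0:m0-1,j) without reading C.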
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                           // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                            // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))           // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))     // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))    // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2)))    // store to C with mask
+    vmovupd( zmm14,(rdx))
+    vmovupd( zmm15,0x40(rdx) MASK_(k(2)))          // store to C with mask
+
+    jmp(.DDONE)                                // jump to end.
+
+
+    label(.DROWSTORBZ)
+
+    // yet to be implemented
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a] "m" (a),
+      [rs_a] "m" (rs_a),
+      [cs_a] "m" (cs_a),
+      [ps_a8] "m" (ps_a8),
+      [b] "m" (b),
+      [rs_b] "m" (rs_b),
+      [cs_b] "m" (cs_b),
+      [alpha] "m" (alpha),
+      [beta] "m" (beta),
+      [c] "m" (c),
+      [rs_c] "m" (rs_c),
+      [cs_c] "m" (cs_c),
+      [n0] "m" (n0),
+      [m0] "m" (m0),
+      [mask] "m" (mask)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_8x5
+(
+    conj_t  conja,
+    conj_t  conjb,
+    dim_t   m0,
+    dim_t   n0,
+    dim_t   k0,
+    double* restrict alpha,
+    double* restrict a, inc_t rs_a0, inc_t cs_a0,
+    double* restrict b, inc_t rs_b0, inc_t cs_b0,
+    double* restrict beta,
+    double* restrict c, inc_t rs_c0, inc_t cs_c0,
+    auxinfo_t* restrict data,
+    cntx_t* restrict cntx
+)
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a = rs_a0;
+    uint64_t cs_a = cs_a0;
+    uint64_t rs_b = rs_b0;
+    uint64_t cs_b = cs_b0;
+    uint64_t rs_c = rs_c0;
+    uint64_t cs_c = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8 = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register.
+     * So, we use an 8-bit mask to specify which elements are to be
+     * loaded/stored into/from the register. m_left % 8 specifies how many
+     * elements are to be loaded/stored into/from the last register.
+     * For example, if m_left = 3, m0 & 7 becomes 3, which indicates that
+     * 3 elements have to be loaded/stored into/from the last register, so
+     * 0xff (11111111) is shifted right by (8 - 3) = 5 bits, which makes
+     * the mask (00000111).
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For the special case where m_left is a multiple of 8, all 8 elements
+    // have to be loaded/stored, so the mask becomes 0xff (11111111).
+    if (mask == 0) mask = 0xff;
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2), r13)        // r13 = 3*cs_b
+    // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+    // is also used to traverse the B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b (B for prefetching)
+    // if n > 4, a second pointer, which points to r11 + 4*cs_b,
+    // is also used to prefetch from the B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4*cs_b (B for prefetching)
+
+    /* Register usage: zmm0 and zmm3 are used to load the A matrix,
+     * zmm6, zmm8, zmm10, zmm12 and zmm14 are used for accumulation,
+     * zmm30-31 are used for broadcasting the B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm14, zmm14, zmm14)
+
+    // K is unrolled by 8 to facilitate prefetch of B.
+    // Assuming B is col-stored, for each iteration of K,
+    // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)              // main loop
+    mov(var(k_iter), rsi)           // i = k_iter
+    sub(imm( 5+TAIL_NITER), rsi)    // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)              // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )   // load A with mask and zero hint
+        add( r10,rax )                                  // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )   // load A with mask and zero hint
+        add( r10,rax )                                  // a += cs_a
+        prefetch( 0,mem(r11) )                          // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                   // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                   // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )   // load A with mask and zero hint
+        add( r10,rax )                                  // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                     // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd(
mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( 
mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(5), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( 
zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. 
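+    // As in the wider kernels, the remaining TAIL_NITER iterations of K
+    // are peeled into LOOP3 below; by then LOOP2 has prefetched all five
+    // columns of the 8x5 C tile (one prefetchw0 per iteration, with rdx
+    // advancing by cs_c).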
+ label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) 
// load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm14 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm14 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
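+    // Post-accumulation for the 8x5 block: zmm6/8/10/12/14 hold the five
+    // columns of A*B; each is scaled by alpha and then either merged with
+    // beta*C via vfmadd231pd (beta != 0) or stored directly (beta == 0).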
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm14,zmm14 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,r13,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm14) + vmovupd( zmm14,(rdx) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm14,(rdx) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
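+    // Note: in this 8x5 kernel every load/store of C goes through the k2
+    // mask, since the single register per column may cover fewer than 8
+    // rows (m0 <= 8).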
+
+
+    label(.DROWSTORBZ)
+
+    // yet to be implemented
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a] "m" (a),
+      [rs_a] "m" (rs_a),
+      [cs_a] "m" (cs_a),
+      [ps_a8] "m" (ps_a8),
+      [b] "m" (b),
+      [rs_b] "m" (rs_b),
+      [cs_b] "m" (cs_b),
+      [alpha] "m" (alpha),
+      [beta] "m" (beta),
+      [c] "m" (c),
+      [rs_c] "m" (rs_c),
+      [cs_c] "m" (cs_c),
+      [n0] "m" (n0),
+      [m0] "m" (m0),
+      [mask] "m" (mask)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c
new file mode 100644
index 000000000..dc874680c
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c
@@ -0,0 +1,2841 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/* These kernels assume that the A matrix needs to be in col-major order.
+ * The B matrix can be col/row-major.
+ * The C matrix can be col/row-major, though support for row-major order
+ * will be added in a separate commit.
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of the A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x6
+(
+    conj_t  conja,
+    conj_t  conjb,
+    dim_t   m0,
+    dim_t   n0,
+    dim_t   k0,
+    double* restrict alpha,
+    double* restrict a, inc_t rs_a0, inc_t cs_a0,
+    double* restrict b, inc_t rs_b0, inc_t cs_b0,
+    double* restrict beta,
+    double* restrict c, inc_t rs_c0, inc_t cs_c0,
+    auxinfo_t* restrict data,
+    cntx_t* restrict cntx
+)
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a = rs_a0;
+    uint64_t cs_a = cs_a0;
+    uint64_t rs_b = rs_b0;
+    uint64_t cs_b = cs_b0;
+    uint64_t rs_c = rs_c0;
+    uint64_t cs_c = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8 = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register.
+     * So, we use an 8-bit mask to specify which elements are to be
+     * loaded/stored into/from the register. m_left % 8 specifies how many
+     * elements are to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that
+     * 3 elements have to be loaded/stored into/from the last register, so
+     * 0xff (11111111) is shifted right by (8 - 3) = 5 bits, which makes
+     * the mask (00000111).
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For the special cases where m_left = 24/16/8, all 8 elements have to
+    // be loaded/stored, so the mask becomes 0xff (11111111).
+    if (mask == 0) mask = 0xff;
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2), r13)        // r13 = 3*cs_b
+    // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+    // is also used to traverse the B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b (B for prefetching)
+    // if n > 4, a second pointer, which points to r11 + 4*cs_b,
+    // is also used to prefetch from the B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4*cs_b (B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load the A matrix,
+     * zmm6-17 and zmm24-29 are used for accumulation,
+     * zmm30-31 are used for broadcasting the B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm28, zmm28, zmm28)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm29, zmm29, zmm29)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm26, zmm26, zmm26)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm13, zmm13, zmm13)
+    vxorpd(zmm27, zmm27, zmm27)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm15, zmm15, zmm15)
+    vxorpd(zmm24, zmm24, zmm24)
+    vxorpd(zmm16, zmm16, zmm16)
+    vxorpd(zmm17, zmm17, zmm17)
+    vxorpd(zmm25, zmm25, zmm25)
+
+    // K is unrolled by 8 to facilitate
prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 6+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( 
zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( 
zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(6), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
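
[Editorial note, not part of the patch] The three-phase structure here (.LOOP1 for the bulk FMA work, .LOOP2 with one C prefetch per iteration, .LOOP3 for the trailing TAIL_NITER iterations) is just trip-count bookkeeping around the same unrolled-by-8 body. A minimal C model of how k_iter is split, assuming the kernel's NR = 6 and the TAIL_NITER macro defined elsewhere in this file:

    #include <stdint.h>

    /* Sketch: mirrors the rsi arithmetic around .LOOP1/.LOOP2/.LOOP3. */
    static void split_k_iter( int64_t k_iter, int64_t nr, int64_t tail_niter,
                              int64_t* loop1, int64_t* loop2, int64_t* loop3 )
    {
        int64_t i = k_iter - ( nr + tail_niter ); // sub(imm(6+TAIL_NITER), rsi)
        *loop1 = i > 0 ? i : 0;                   // .LOOP1: bulk FMA iterations
        i = ( i > 0 ? 0 : i ) + nr;               // add(imm(6), rsi)
        *loop2 = i > 0 ? i : 0;                   // .LOOP2: one C prefetch each
        i = ( i > 0 ? 0 : i ) + tail_niter;       // add(imm(TAIL_NITER), rsi)
        *loop3 = i > 0 ? i : 0;                   // .LOOP3: trailing iterations
        // loop1 + loop2 + loop3 == k_iter always holds.
    }
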
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( 
zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + 
vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. 
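
[Editorial note, not part of the patch] Each .LOOP2 iteration above write-prefetches one column of the 24-row C tile before advancing rdx by cs_c: rdx starts at C plus a 7*8-byte offset, and the three prefetchw0 instructions (rdx, rdx+64, rdx+128) land in the three cache lines a 24-double column spans. A rough stand-in using the GCC builtin (an assumption; the kernel emits prefetchw directly):

    /* Sketch: one column's worth of C write-prefetch in .LOOP2. */
    static void prefetch_c_column( double* c_col )
    {
        __builtin_prefetch( c_col + 7,  1, 0 ); // cache line holding rows 0..7
        __builtin_prefetch( c_col + 15, 1, 0 ); // cache line holding rows 8..15
        __builtin_prefetch( c_col + 23, 1, 0 ); // cache line holding rows 16..23
    }
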
+ label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( 
mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) 
+ vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. 
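
[Editorial note, not part of the patch] The .TAIL / .DLOOPKLEFT section that follows retires the k_left = k0 % 8 remainder one rank-1 update at a time, with the same broadcast-FMA pattern but no unrolling or prefetch. A scalar C model, under the kernel's assumption that A is column-stored with unit row stride:

    #include <stdint.h>

    /* Sketch: what one pass of .DLOOPKLEFT accumulates per leftover k. */
    static void tail_k_left( int64_t k_left, int64_t m, int64_t n,
                             const double* a, int64_t cs_a,
                             const double* b, int64_t rs_b, int64_t cs_b,
                             double* acc ) // m*n accumulators, column-major
    {
        for ( int64_t k = 0; k < k_left; ++k )
            for ( int64_t j = 0; j < n; ++j )     // vbroadcastsd of b(k,j)
                for ( int64_t i = 0; i < m; ++i ) // vfmadd231pd into accumulators
                    acc[ j*m + i ] += a[ k*cs_a + i ] * b[ k*rs_b + j*cs_b ];
    }
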
+ + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm29,zmm29 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm26,zmm26 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm13,zmm13 ) + vmulpd( zmm30,zmm27,zmm27 ) + vmulpd( zmm30,zmm14,zmm14 ) + vmulpd( zmm30,zmm15,zmm15 ) + vmulpd( zmm30,zmm24,zmm24 ) + vmulpd( zmm30,zmm16,zmm16 ) + vmulpd( zmm30,zmm17,zmm17 ) + vmulpd( zmm30,zmm25,zmm25 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1) + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm28) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( 0x40(rcx,rdi,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm29) + vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0) + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( 0x40(rcx,rdi,2),zmm1) + vfmadd231pd( zmm1,zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C 
using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm26) + vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,r13,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( 0x40(rcx,r13,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm13) + vmovupd( zmm13,0x40(rcx,r13,1)) + vmovupd( 0x80(rcx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm27) + vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm14) + vmovupd( zmm14,(rdx)) + vmovupd( 0x40(rdx),zmm1) + vfmadd231pd( zmm1,zmm31,zmm15) + vmovupd( zmm15,0x40(rdx)) + vmovupd( 0x80(rdx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm24) + vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm16) + vmovupd( zmm16,(rdx,rdi,1)) + vmovupd( 0x40(rdx,rdi,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm17) + vmovupd( zmm17,0x40(rdx,rdi,1)) + vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm25) + vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx)) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( zmm13,0x40(rcx,r13,1)) + vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm14,(rdx)) + vmovupd( zmm15,0x40(rdx)) + vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask + vmovupd( zmm16,(rdx,rdi,1)) + vmovupd( zmm17,0x40(rdx,rdi,1)) + vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
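
[Editorial note, not part of the patch] What one masked column update in .DCOLSTORED computes, expressed with AVX-512 intrinsics; k2 carries the same 0xff >> (8 - (m0 & 7)) mask (remapped to 0xff when m0 % 8 == 0) that the kernel moves into the k2 register. A sketch, not the patch's code:

    #include <immintrin.h>

    /* Sketch: masked beta-update of the final partial register of a column. */
    static void masked_axpby_col( double* c, __m512d acc, double beta, __mmask8 k2 )
    {
        __m512d cv = _mm512_maskz_loadu_pd( k2, c );              // masked load, zeroed lanes
        acc = _mm512_fmadd_pd( cv, _mm512_set1_pd( beta ), acc ); // acc += beta * C
        _mm512_mask_storeu_pd( c, k2, acc );                      // masked store back to C
    }
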
+
+
+    label(.DROWSTORBZ)
+
+    // yet to be implemented
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [ps_a8]  "m" (ps_a8),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [mask]   "m" (mask)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x6
+(
+    conj_t    conja,
+    conj_t    conjb,
+    dim_t     m0,
+    dim_t     n0,
+    dim_t     k0,
+    double*   restrict alpha,
+    double*   restrict a, inc_t rs_a0, inc_t cs_a0,
+    double*   restrict b, inc_t rs_b0, inc_t cs_b0,
+    double*   restrict beta,
+    double*   restrict c, inc_t rs_c0, inc_t cs_c0,
+    auxinfo_t* restrict data,
+    cntx_t*   restrict cntx
+)
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a = rs_a0;
+    uint64_t cs_a = cs_a0;
+    uint64_t rs_b = rs_b0;
+    uint64_t cs_b = cs_b0;
+    uint64_t rs_c = rs_c0;
+    uint64_t cs_c = cs_c0;
+
+    uint64_t ps_a  = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8 = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double-precision elements can be loaded into a 512-bit register,
+     * so we use an 8-bit mask to specify which elements are to be
+     * loaded/stored into/from the register. m_left % 8 specifies how many
+     * elements are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements + * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3) + * times to the right which makes the mask to be (00000111) + */ + uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left + // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored + // So, mask becomes 0xff(11111111) + if (mask == 0) mask = 0xff; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + mov(var(mask), rdx) // load mask + kmovw(edx, k(2)) // move mask to k2 register + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b + // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b + //is also used to traverse B matrix + lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b + lea(mem(rcx, 7*8), rdx) // C for prefetching + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + // if n > 4, a second pointer which point to r11 + 4*cs_b + //is also used to prefetch from B matrix + lea(mem(r11, r9, 4), r15) // r15 = r11 + 4* cs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) + vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 6+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( 
zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += 
rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(6), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
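
[Editorial note, not part of the patch] Relative to the 24-row kernel above, this 16x6 edge kernel keeps the same broadcast-FMA scheme but carries only two A registers per K step (zmm0/zmm1 or zmm3/zmm4) and twelve accumulators (zmm6-zmm17) instead of three A registers and eighteen accumulators. Per K step it performs, in effect:

    /* Sketch: the rank-1 update one K step of the 16x6 kernel computes. */
    static void rank1_update_16x6( const double a[16], const double b[6],
                                   double acc[6][16] )
    {
        for ( int j = 0; j < 6; ++j )       // vbroadcastsd of b[j] into zmm30/zmm31
            for ( int i = 0; i < 16; ++i )  // two vfmadd231pd per column of B
                acc[ j ][ i ] += a[ i ] * b[ j ];
    }
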
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + 
vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + 
vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( 
zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 
) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
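The .DLOOPKLEFT loop just above drains the k_left = k0 % 8 iterations that the 8-way unrolled loops do not cover, one rank-1 update per pass. A minimal scalar model of one such pass, assuming illustrative names (ref_accum_16x6, acc, a_col, b_row are not BLIS identifiers):

    // Scalar model of one k-step of the accumulation above: each of the
    // 6 columns of B is broadcast and FMA'd into the accumulators that
    // hold up to 16 rows of C (two zmm registers per column, the second
    // one loaded through the k2 mask).
    static void ref_accum_16x6(double acc[16][6], const double *a_col,
                               const double *b_row)
    {
        for (int j = 0; j < 6; ++j)      // columns of B
            for (int i = 0; i < 16; ++i) // rows of A (tail rows masked)
                acc[i][j] += a_col[i] * b_row[j];
    }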
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm13,zmm13 ) + vmulpd( zmm30,zmm14,zmm14 ) + vmulpd( zmm30,zmm15,zmm15 ) + vmulpd( zmm30,zmm16,zmm16 ) + vmulpd( zmm30,zmm17,zmm17 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm4,zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0) + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,r13,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( 0x40(rcx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm4,zmm31,zmm13) + vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm14) + vmovupd( zmm14,(rdx)) + vmovupd( 0x40(rdx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm15) + vmovupd( zmm15,0x40(rdx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm16) + vmovupd( zmm16,(rdx,rdi,1)) + vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm4,zmm31,zmm17) + vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. 
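For reference, the .DCOLSTORED path above and the beta == 0 variant that follows compute the standard sup update C = beta*C + alpha*acc, column by column, with the second half of each column going through the k2 mask. A scalar sketch under those assumptions (hypothetical helper, not a BLIS interface):

    // What the column-stored update computes, for m <= 16 rows and
    // 6 columns of C.
    static void ref_update_c(double *c, long long cs_c, int m,
                             double alpha, double beta,
                             const double acc[16][6])
    {
        for (int j = 0; j < 6; ++j)
            for (int i = 0; i < m; ++i)
                c[i + j * cs_c] = (beta == 0.0)
                    ? alpha * acc[i][j]  // .DBETAZERO: C is never read
                    : beta * c[i + j * cs_c] + alpha * acc[i][j];
    }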
+
+
+ label(.DBETAZERO)
+ cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8
+
+ jz(.DROWSTORBZ) // jump to row storage case
+ label(.DCOLSTORBZ)
+ vmovupd( zmm6,(rcx))
+ vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm8,(rcx,rdi,1))
+ vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm10,(rcx,rdi,2))
+ vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm12,(rcx,r13,1))
+ vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm14,(rdx))
+ vmovupd( zmm15,0x40(rdx) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm16,(rdx,rdi,1))
+ vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2))) // store to C with mask
+
+ jmp(.DDONE) // jump to end.
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_8x6
+(
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double precision elements can be loaded into a 512-bit register,
+ * so we use an 8-bit mask to specify which elements are to be loaded/stored
+ * into/from the register. m_left % 8 specifies how many elements
+ * are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements + * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3) + * times to the right which makes the mask to be (00000111) + */ + uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left + // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored + // So, mask becomes 0xff(11111111) + if (mask == 0) mask = 0xff; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), rax) // load address of a + mov(var(cs_a), r10) // load cs_a + mov(var(b), rbx) // load address of b + mov(var(rs_b), r8) // load rs_b + mov(var(cs_b), r9) // load cs_b + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c + mov(var(mask), rdx) // load mask + kmovw(edx, k(2)) // move mask to k2 register + lea(mem(, r8, 8), r8) // rs_b *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_b *= sizeof(double) + lea(mem(, r10, 8), r10) // cs_a *= sizeof(double) + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b + // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b + //is also used to traverse B matrix + lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b + lea(mem(rcx, 7*8), rdx) // C for prefetching + lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching) + // if n > 4, a second pointer which point to r11 + 4*cs_b + //is also used to prefetch from B matrix + lea(mem(r11, r9, 4), r15) // r15 = r11 + 4* cs_b(B for prefetching) + + /* Register usage: zmm0-5 are used to load A matrix + * zmm6-29 are used for accumulation + * zmm30-31 are used for broadcasting B matrix + */ + + // zero out all accumulation registers + vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm16, zmm16, zmm16) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 6+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( 
zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( 
zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(6), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 4 + vmovupd( 
mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. 
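The jump structure seen here (.LOOP1 / .PREFETCHLOOP / .LOOP2 / .TAILITER / .LOOP3) splits the unrolled k loop into three phases so that the write-prefetch of C is spread over NR unrolled iterations (NR = 6 for this kernel, 7 in the Mx7 kernels below, one prefetchw per column as rdx steps by cs_c) and finishes TAIL_NITER iterations before the loop ends, keeping the prefetched C lines warm for the final update. A rough C-level sketch of that control flow, with illustrative stub names that are not BLIS symbols:

    void unrolled_by_8(void);            // one 8-way unrolled k step
    void unrolled_by_8_prefetch_c(void); // same, plus prefetchw of one C column
    void single_k_step(void);            // one plain k iteration

    void k_loop_phases(long k_iter, long k_left, long nr, long tail_niter)
    {
        long i = k_iter;
        for (; i > nr + tail_niter; --i) unrolled_by_8();            // .LOOP1
        for (; i > tail_niter; --i)      unrolled_by_8_prefetch_c(); // .LOOP2
        for (; i > 0; --i)               unrolled_by_8();            // .LOOP3
        for (i = k_left; i > 0; --i)     single_k_step();            // .DLOOPKLEFT
    }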
+ label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( 
zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm16 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm16 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm14,zmm14 ) + vmulpd( zmm30,zmm16,zmm16 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,r13,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm14) + vmovupd( zmm14,(rdx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm16) + vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm14,(rdx) MASK_(k(2))) // store to C with mask + vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
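The MASK_(k(2)) loads and stores above rely on the tail mask computed from m0 in the function prologue. A stand-alone check of that expression (plain C, independent of BLIS):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        // Same expression as the kernel prologue: keep the low (m0 % 8)
        // lanes; a full multiple of 8 keeps all lanes.
        for (uint64_t m0 = 1; m0 <= 8; ++m0)
        {
            uint8_t mask = 0xff >> (0x8 - (m0 & 7));
            if (mask == 0) mask = 0xff;
            printf("m0 = %llu -> mask = 0x%02x\n",
                   (unsigned long long)m0, mask);
        }
        return 0; // e.g. m0 = 3 -> 0x07, m0 = 8 -> 0xff
    }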
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c
new file mode 100644
index 000000000..bc8bf3d26
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c
@@ -0,0 +1,3112 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/* These kernels assume that the A matrix is in col-major order.
+ * The B matrix can be col/row-major.
+ * The C matrix can be col/row-major, though support for row-major order
+ * will be added by a separate commit.
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x7
+(
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double precision elements can be loaded into a 512-bit register,
+ * so we use an 8-bit mask to specify which elements are to be loaded/stored
+ * into/from the register. m_left % 8 specifies how many elements
+ * are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that 3 elements
+ * have to be loaded/stored into/from the register, so shift 0xff(11111111) by (8-3)
+ * places to the right, which makes the mask (00000111).
+ */
+ uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+ // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored,
+ // so mask becomes 0xff(11111111)
+ if (mask == 0) mask = 0xff;
+
+ // -------------------------------------------------------------------------
+ begin_asm()
+
+ mov(var(a), rax) // load address of a
+ mov(var(cs_a), r10) // load cs_a
+ mov(var(b), rbx) // load address of b
+ mov(var(rs_b), r8) // load rs_b
+ mov(var(cs_b), r9) // load cs_b
+ mov(var(c), rcx) // load address of c
+ mov(var(cs_c), rdi) // load cs_c
+ mov(var(mask), rdx) // load mask
+ kmovw(edx, k(2)) // move mask to k2 register
+ lea(mem(, r8, 8), r8) // rs_b *= sizeof(double)
+ lea(mem(, r9, 8), r9) // cs_b *= sizeof(double)
+ lea(mem(, r10, 8), r10) // cs_a *= sizeof(double)
+ lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
+ lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b
+ // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+ // is also used to traverse the B matrix
+ lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b
+ lea(mem(rcx, 7*8), rdx) // C for prefetching
+ lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b (B for prefetching)
+ // if n > 4, a second pointer, which points to r11 + 4*cs_b,
+ // is also used to prefetch from the B matrix
+ lea(mem(r11, r9, 4), r15) // r15 = r11 + 4*cs_b (B for prefetching)
+
+ /* Register usage: zmm0-5 are used to load A matrix
+ * zmm6-29 are used for accumulation
+ * zmm30-31 are used for broadcasting B matrix
+ */
+
+ // zero out all accumulation registers
+ vxorpd(zmm6, zmm6, zmm6)
+ vxorpd(zmm7, zmm7, zmm7)
+ vxorpd(zmm28, zmm28, zmm28)
+ vxorpd(zmm8, zmm8, zmm8)
+ vxorpd(zmm9, zmm9, zmm9)
+ vxorpd(zmm29, zmm29, zmm29)
+ vxorpd(zmm10, zmm10, zmm10)
+ vxorpd(zmm11, zmm11, zmm11)
+ vxorpd(zmm26, zmm26, zmm26)
+ vxorpd(zmm12, zmm12, zmm12)
+ vxorpd(zmm13, zmm13, zmm13)
+ vxorpd(zmm27, zmm27, zmm27)
+ vxorpd(zmm14, zmm14, zmm14)
+ vxorpd(zmm15, zmm15, zmm15)
+ vxorpd(zmm24, zmm24, zmm24)
+ vxorpd(zmm16, zmm16, zmm16)
+ vxorpd(zmm17, zmm17, zmm17)
+ vxorpd(zmm25, zmm25, zmm25)
+ vxorpd(zmm18, zmm18, zmm18)
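As an aid to reading the FMA blocks that follow, this is the accumulator-to-column mapping implied by the broadcasts below (derived from the code, not stated in the source): each column j of the 24x7 C tile is held in three zmm registers covering rows 0-7, 8-15 and 16-23 respectively.

    // Accumulators per column of C (rows 0-7 / 8-15 / 16-23):
    //   col 0: zmm6,  zmm7,  zmm28        col 4: zmm14, zmm15, zmm24
    //   col 1: zmm8,  zmm9,  zmm29        col 5: zmm16, zmm17, zmm25
    //   col 2: zmm10, zmm11, zmm26        col 6: zmm18, zmm19, zmm22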
vxorpd(zmm19, zmm19, zmm19) + vxorpd(zmm22, zmm22, zmm22) + + // K is unrolled by 8 to facilitate prefetch of B + // Assuming B to be col-stored, for each iteration of K, + //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b + label(.DLOOPKITER) // main loop + mov(var(k_iter), rsi) // i = k_iter + sub(imm( 7+TAIL_NITER), rsi) // i -= NR + TAIL_NITER + jle(.PREFETCHLOOP) // jump if i <= 0 + + label(.LOOP1) + + // ---------------------------------- iteration 1 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 
) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + 
vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + 
vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(7), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch 
C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + 
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vfmadd231pd( zmm2,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ vfmadd231pd( zmm2,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+ vfmadd231pd( zmm2,zmm30,zmm22 )
+
+ // ---------------------------------- iteration 6
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 )
+ vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15,r9,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vfmadd231pd( zmm5,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vfmadd231pd( zmm5,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vfmadd231pd( zmm5,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vfmadd231pd( zmm5,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vfmadd231pd( zmm5,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ vfmadd231pd( zmm5,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+ vfmadd231pd( zmm5,zmm30,zmm22 )
+
+ // ---------------------------------- iteration 7
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 )
+ vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15,r9,2) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vfmadd231pd( zmm2,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vfmadd231pd( zmm2,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vfmadd231pd( zmm2,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vfmadd231pd( zmm2,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vfmadd231pd( zmm2,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ vfmadd231pd( zmm2,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+ vfmadd231pd( zmm2,zmm30,zmm22 )
+
+ // ---------------------------------- iteration 8
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vfmadd231pd( zmm5,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vfmadd231pd( zmm5,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vfmadd231pd( zmm5,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vfmadd231pd( zmm5,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vfmadd231pd( zmm5,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ vfmadd231pd( zmm5,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+ vfmadd231pd( zmm5,zmm30,zmm22 )
+ lea(mem(rdx, rdi, 1), rdx) // C += cs_c
+ lea(mem(r11,r8,8), r11) // b_next += 8*rs_b
+ lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b
+ sub(imm(1), rsi) // i -= 1
+ jnz(.LOOP2) // iterate again if i != 0.
+ label(.TAILITER)
+ add(imm(TAIL_NITER), rsi) // i += TAIL_NITER
+ jle(.TAIL) // jump if i <= 0
+
+ label(.LOOP3)
+
+ // ---------------------------------- iteration 1
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 )
+ vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 )
+ vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vfmadd231pd( zmm2,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vfmadd231pd( zmm2,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vfmadd231pd( zmm2,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vfmadd231pd( zmm2,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vfmadd231pd( zmm2,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ vfmadd231pd( zmm2,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+ vfmadd231pd( zmm2,zmm30,zmm22 )
+
+ // ---------------------------------- iteration 2
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 )
+ vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r9,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vfmadd231pd( zmm5,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vfmadd231pd( zmm5,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vfmadd231pd( zmm5,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vfmadd231pd( zmm5,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vfmadd231pd( zmm5,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ vfmadd231pd( zmm5,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+ vfmadd231pd( zmm5,zmm30,zmm22 )
+
+ // ---------------------------------- iteration 3
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 )
+ vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r9,2) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vfmadd231pd( zmm2,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vfmadd231pd( zmm2,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vfmadd231pd( zmm2,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vfmadd231pd( zmm2,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vfmadd231pd( zmm2,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ vfmadd231pd( zmm2,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+ vfmadd231pd( zmm2,zmm30,zmm22 )
+
+ // ---------------------------------- iteration 4
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 )
+ vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r13,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vfmadd231pd( zmm5,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vfmadd231pd( zmm5,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vfmadd231pd( zmm5,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vfmadd231pd( zmm5,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vfmadd231pd( zmm5,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ vfmadd231pd( zmm5,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+ vfmadd231pd( zmm5,zmm30,zmm22 )
+
+ // ---------------------------------- iteration 5
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 )
+ vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vfmadd231pd( zmm2,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vfmadd231pd( zmm2,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vfmadd231pd( zmm2,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vfmadd231pd( zmm2,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vfmadd231pd( zmm2,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ vfmadd231pd( zmm2,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+ vfmadd231pd( zmm2,zmm30,zmm22 )
+
+ // ---------------------------------- iteration 6
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 )
+ vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15,r9,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vfmadd231pd( zmm5,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vfmadd231pd( zmm5,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vfmadd231pd( zmm5,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vfmadd231pd( zmm5,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vfmadd231pd( zmm5,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ vfmadd231pd( zmm5,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+ vfmadd231pd( zmm5,zmm30,zmm22 )
+
+ // ---------------------------------- iteration 7
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 )
+ vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15,r9,2) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vfmadd231pd( zmm2,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vfmadd231pd( zmm2,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vfmadd231pd( zmm2,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vfmadd231pd( zmm2,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vfmadd231pd( zmm2,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ vfmadd231pd( zmm2,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+ vfmadd231pd( zmm2,zmm30,zmm22 )
+
+ // ---------------------------------- iteration 8
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vfmadd231pd( zmm5,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vfmadd231pd( zmm5,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vfmadd231pd( zmm5,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vfmadd231pd( zmm5,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vfmadd231pd( zmm5,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ vfmadd231pd( zmm5,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+ vfmadd231pd( zmm5,zmm30,zmm22 )
+ lea(mem(r11,r8,8), r11) // b_next += 8*rs_b
+ lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b
+ dec(rsi) // i -= 1
+ jnz(.LOOP3) // iterate again if i != 0.
+
+
+ label(.TAIL)
+ mov(var(k_left), rsi) // i = k_left
+ test(rsi, rsi) // check i via logical AND
+ je(.DPOSTACCUM) // if i == 0, jump to post-accumulation
+
+ label(.DLOOPKLEFT) // k_left loop
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 )
+ vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vfmadd231pd( zmm2,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vfmadd231pd( zmm2,zmm31,zmm29 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vfmadd231pd( zmm2,zmm30,zmm26 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vfmadd231pd( zmm2,zmm31,zmm27 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vfmadd231pd( zmm2,zmm30,zmm24 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ vfmadd231pd( zmm2,zmm31,zmm25 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+ vfmadd231pd( zmm2,zmm30,zmm22 )
+ dec(rsi) // i -= 1
+ jne(.DLOOPKLEFT) // iterate again if i != 0.
+
+
+ label(.DPOSTACCUM)
+ mov(var(alpha), rdx) // load address of alpha
+ vbroadcastsd(mem(rdx), zmm30) // broadcast alpha
+ mov(var(beta), rax) // load address of beta
+ vbroadcastsd(mem(rax), zmm31) // broadcast beta
+
+ // scale by alpha
+ vmulpd( zmm30,zmm6,zmm6 )
+ vmulpd( zmm30,zmm7,zmm7 )
+ vmulpd( zmm30,zmm28,zmm28 )
+ vmulpd( zmm30,zmm8,zmm8 )
+ vmulpd( zmm30,zmm9,zmm9 )
+ vmulpd( zmm30,zmm29,zmm29 )
+ vmulpd( zmm30,zmm10,zmm10 )
+ vmulpd( zmm30,zmm11,zmm11 )
+ vmulpd( zmm30,zmm26,zmm26 )
+ vmulpd( zmm30,zmm12,zmm12 )
+ vmulpd( zmm30,zmm13,zmm13 )
+ vmulpd( zmm30,zmm27,zmm27 )
+ vmulpd( zmm30,zmm14,zmm14 )
+ vmulpd( zmm30,zmm15,zmm15 )
+ vmulpd( zmm30,zmm24,zmm24 )
+ vmulpd( zmm30,zmm16,zmm16 )
+ vmulpd( zmm30,zmm17,zmm17 )
+ vmulpd( zmm30,zmm25,zmm25 )
+ vmulpd( zmm30,zmm18,zmm18 )
+ vmulpd( zmm30,zmm19,zmm19 )
+ vmulpd( zmm30,zmm22,zmm22 )
+
+
+ mov(var(rs_c), rsi) // load rs_c
+ lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
+ lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c
+ lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c
+ vxorpd(ymm2, ymm2, ymm2)
+ vucomisd(xmm2, xmm31) // set ZF if beta == 0
+ je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case
+
+
+ cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8
+
+
+ jz(.DROWSTORED) // jump to row storage case
+
+ label(.DCOLSTORED)
+ vmovupd( mem(rcx),zmm0)
+ vfmadd231pd( zmm0,zmm31,zmm6)
+ vmovupd( zmm6,(rcx))
+ vmovupd( 0x40(rcx),zmm1)
+ vfmadd231pd( zmm1,zmm31,zmm7)
+ vmovupd( zmm7,0x40(rcx))
+ vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm2,zmm31,zmm28)
+ vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rcx,rdi,1),zmm3)
+ vfmadd231pd( zmm3,zmm31,zmm8)
+ vmovupd( zmm8,(rcx,rdi,1))
+ vmovupd( 0x40(rcx,rdi,1),zmm4)
+ vfmadd231pd( zmm4,zmm31,zmm9)
+ vmovupd( zmm9,0x40(rcx,rdi,1))
+ vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm5,zmm31,zmm29)
+ vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rcx,rdi,2),zmm0)
+ vfmadd231pd( zmm0,zmm31,zmm10)
+ vmovupd( zmm10,(rcx,rdi,2))
+ vmovupd( 0x40(rcx,rdi,2),zmm1)
+ vfmadd231pd( zmm1,zmm31,zmm11)
+ vmovupd( zmm11,0x40(rcx,rdi,2))
+ vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm2,zmm31,zmm26)
+ vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rcx,r13,1),zmm3)
+ vfmadd231pd( zmm3,zmm31,zmm12)
+ vmovupd( zmm12,(rcx,r13,1))
+ vmovupd( 0x40(rcx,r13,1),zmm4)
+ vfmadd231pd( zmm4,zmm31,zmm13)
+ vmovupd( zmm13,0x40(rcx,r13,1))
+ vmovupd( 0x80(rcx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm5,zmm31,zmm27)
+ vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rdx),zmm0)
+ vfmadd231pd( zmm0,zmm31,zmm14)
+ vmovupd( zmm14,(rdx))
+ vmovupd( 0x40(rdx),zmm1)
+ vfmadd231pd( zmm1,zmm31,zmm15)
+ vmovupd( zmm15,0x40(rdx))
+ vmovupd( 0x80(rdx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm2,zmm31,zmm24)
+ vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rdx,rdi,1),zmm3)
+ vfmadd231pd( zmm3,zmm31,zmm16)
+ vmovupd( zmm16,(rdx,rdi,1))
+ vmovupd( 0x40(rdx,rdi,1),zmm4)
+ vfmadd231pd( zmm4,zmm31,zmm17)
+ vmovupd( zmm17,0x40(rdx,rdi,1))
+ vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm5,zmm31,zmm25)
+ vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rdx,rdi,2),zmm0)
+ vfmadd231pd( zmm0,zmm31,zmm18)
+ vmovupd( zmm18,(rdx,rdi,2))
+ vmovupd( 0x40(rdx,rdi,2),zmm1)
+ vfmadd231pd( zmm1,zmm31,zmm19)
+ vmovupd( zmm19,0x40(rdx,rdi,2))
+ vmovupd( 0x80(rdx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm2,zmm31,zmm22)
+ vmovupd( zmm22,0x80(rdx,rdi,2) MASK_(k(2))) // store to C with mask
+
+ jmp(.DDONE) // jump to end.
+
+ label(.DROWSTORED)
+
+ // yet to be implemented
+ jmp(.DDONE) // jump to end.
+
+
+ label(.DBETAZERO)
+ cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8
+
+ jz(.DROWSTORBZ) // jump to row storage case
+ label(.DCOLSTORBZ)
+ vmovupd( zmm6,(rcx))
+ vmovupd( zmm7,0x40(rcx))
+ vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm8,(rcx,rdi,1))
+ vmovupd( zmm9,0x40(rcx,rdi,1))
+ vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm10,(rcx,rdi,2))
+ vmovupd( zmm11,0x40(rcx,rdi,2))
+ vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm12,(rcx,r13,1))
+ vmovupd( zmm13,0x40(rcx,r13,1))
+ vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm14,(rdx))
+ vmovupd( zmm15,0x40(rdx))
+ vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm16,(rdx,rdi,1))
+ vmovupd( zmm17,0x40(rdx,rdi,1))
+ vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm18,(rdx,rdi,2))
+ vmovupd( zmm19,0x40(rdx,rdi,2))
+ vmovupd( zmm22,0x80(rdx,rdi,2) MASK_(k(2))) // store to C with mask
+
+ jmp(.DDONE) // jump to end.
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+ }
+
+
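The 16x7 kernel below (and the 8x7 kernel after it) repeats the edge-mask computation used by the kernel above: an 8-bit mask derived from m_left drives every MASK_(k(2)) load and store. A minimal standalone sketch of that formula, using only the arithmetic shown in the kernel preambles (the helper name edge_mask is illustrative, not from the patch):

    #include <stdint.h>
    #include <stdio.h>

    // mask = 0xff >> (8 - (m0 & 7)); a result of 0 (m_left = 24/16/8)
    // is promoted to 0xff so the full register is loaded/stored.
    static uint8_t edge_mask( uint64_t m0 )
    {
        uint8_t mask = 0xff >> ( 0x8 - ( m0 & 7 ) );
        if ( mask == 0 ) mask = 0xff;
        return mask;
    }

    int main( void )
    {
        printf( "m0=19 -> 0x%02x\n", edge_mask( 19 ) ); // 0x07 (00000111): 3 lanes live
        printf( "m0=16 -> 0x%02x\n", edge_mask( 16 ) ); // 0xff: all 8 lanes live
        return 0;
    }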
+ void bli_dgemmsup_rv_zen4_asm_16x7
+ (
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+ {
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double precision elements can be loaded into a 512-bit register,
+  * so we use an 8-bit mask to specify which elements are to be loaded/stored
+  * into/from the register. m_left % 8 specifies how many elements
+  * are to be loaded/stored into/from the last register.
+  * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that 3 elements
+  * have to be loaded/stored into/from the register, so 0xff(11111111) is shifted
+  * right by (8-3) bits, which makes the mask (00000111).
+  */
+ uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+ // For the special cases where m_left = 24/16/8, all 8 elements have to be
+ // loaded/stored, so the mask becomes 0xff(11111111).
+ if (mask == 0) mask = 0xff;
+
+ // -------------------------------------------------------------------------
+ begin_asm()
+
+ mov(var(a), rax) // load address of a
+ mov(var(cs_a), r10) // load cs_a
+ mov(var(b), rbx) // load address of b
+ mov(var(rs_b), r8) // load rs_b
+ mov(var(cs_b), r9) // load cs_b
+ mov(var(c), rcx) // load address of c
+ mov(var(cs_c), rdi) // load cs_c
+ mov(var(mask), rdx) // load mask
+ kmovw(edx, k(2)) // move mask to k2 register
+ lea(mem(, r8, 8), r8) // rs_b *= sizeof(double)
+ lea(mem(, r9, 8), r9) // cs_b *= sizeof(double)
+ lea(mem(, r10, 8), r10) // cs_a *= sizeof(double)
+ lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
+ lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b
+ // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+ // is also used to traverse the B matrix
+ lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b
+ lea(mem(rcx, 7*8), rdx) // C for prefetching
+ lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b (B for prefetching)
+ // if n > 4, a second pointer, which points to r11 + 4*cs_b,
+ // is also used to prefetch from the B matrix
+ lea(mem(r11, r9, 4), r15) // r15 = r11 + 4*cs_b (B for prefetching)
+
+ /* Register usage: zmm0-5 are used to load the A matrix
+  * zmm6-19 are used for accumulation
+  * zmm30-31 are used for broadcasting the B matrix
+  */
+
+ // zero out all accumulation registers
+ vxorpd(zmm6, zmm6, zmm6)
+ vxorpd(zmm7, zmm7, zmm7)
+ vxorpd(zmm8, zmm8, zmm8)
+ vxorpd(zmm9, zmm9, zmm9)
+ vxorpd(zmm10, zmm10, zmm10)
+ vxorpd(zmm11, zmm11, zmm11)
+ vxorpd(zmm12, zmm12, zmm12)
+ vxorpd(zmm13, zmm13, zmm13)
+ vxorpd(zmm14, zmm14, zmm14)
+ vxorpd(zmm15, zmm15, zmm15)
+ vxorpd(zmm16, zmm16, zmm16)
+ vxorpd(zmm17, zmm17, zmm17)
+ vxorpd(zmm18, zmm18, zmm18)
+ vxorpd(zmm19, zmm19, zmm19)
+
+ // K is unrolled by 8 to facilitate prefetch of B.
+ // Assuming B to be col-stored, for each iteration of K,
+ // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b
+ label(.DLOOPKITER) // main loop
+ mov(var(k_iter), rsi) // i = k_iter
+ sub(imm( 7+TAIL_NITER), rsi) // i -= NR + TAIL_NITER
+ jle(.PREFETCHLOOP) // jump if i <= 0
+
+ label(.LOOP1)
+
+ // ---------------------------------- iteration 1
+
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 2
+
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r9,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 3
+
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r9,2) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 4
+
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r13,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 5
+
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 6
+
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15,r9,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 7
+
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15,r9,2) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 8
+
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+ lea(mem(r11,r8,8), r11) // b_next += 8*rs_b
+ lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b
+ dec(rsi) // i -= 1
+ jnz(.LOOP1) // iterate again if i != 0.
+
+ label(.PREFETCHLOOP)
+ add(imm(7), rsi) // i += NR
+ jle(.TAILITER) // jump if i <= 0.
+
+ label(.LOOP2)
+
+ // ---------------------------------- iteration 1
+ prefetchw0( mem(rdx)) // prefetch C
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 2
+ prefetchw0( mem(rdx, 64)) // prefetch C
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r9,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 3
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r9,2) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 4
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r13,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 5
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 6
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15,r9,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 7
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15,r9,2) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 8
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+ lea(mem(rdx, rdi, 1), rdx) // C += cs_c
+ lea(mem(r11,r8,8), r11) // b_next += 8*rs_b
+ lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b
+ sub(imm(1), rsi) // i -= 1
+ jnz(.LOOP2) // iterate again if i != 0.
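As in the preceding kernel, the unrolled k loop just closed above is staged: .LOOP1 runs without touching C, the seven .LOOP2 passes each prefetch one cacheline of C ahead of the post-accumulation, and .LOOP3 covers the final TAIL_NITER passes before the k_left remainder. A scalar sketch of the rsi arithmetic, assuming TAIL_NITER is the tuning macro defined earlier in this file (k_loop_partition is an illustrative name, not from the patch):

    #include <stdint.h>

    // Mirrors the rsi arithmetic around .LOOP1/.LOOP2/.LOOP3 above.
    static void k_loop_partition( int64_t k_iter, int64_t tail_niter,
                                  int64_t* loop1, int64_t* loop2, int64_t* loop3 )
    {
        int64_t i = k_iter - ( 7 + tail_niter ); // sub(imm(7+TAIL_NITER), rsi)
        *loop1 = i > 0 ? i : 0;                  // .LOOP1 passes (no C prefetch)
        i = i > 0 ? 0 : i;                       // rsi counts down to 0 if .LOOP1 ran
        i += 7;                                  // add(imm(7), rsi): NR = 7
        *loop2 = i > 0 ? i : 0;                  // .LOOP2 passes (prefetch C)
        i = i > 0 ? 0 : i;
        i += tail_niter;                         // add(imm(TAIL_NITER), rsi)
        *loop3 = i > 0 ? i : 0;                  // .LOOP3 passes
        // loop1 + loop2 + loop3 == k_iter; each pass covers 8 values of k,
        // and the k0 % 8 remainder is handled afterwards by .DLOOPKLEFT.
    }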
+ label(.TAILITER)
+ add(imm(TAIL_NITER), rsi) // i += TAIL_NITER
+ jle(.TAIL) // jump if i <= 0
+
+ label(.LOOP3)
+
+ // ---------------------------------- iteration 1
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 2
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r9,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 3
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r9,2) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 4
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r13,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 5
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 6
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15,r9,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 7
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r15,r9,2) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+
+ // ---------------------------------- iteration 8
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm6 )
+ vfmadd231pd( zmm4,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm8 )
+ vfmadd231pd( zmm4,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm10 )
+ vfmadd231pd( zmm4,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm3,zmm31,zmm12 )
+ vfmadd231pd( zmm4,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm3,zmm30,zmm14 )
+ vfmadd231pd( zmm4,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm3,zmm31,zmm16 )
+ vfmadd231pd( zmm4,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm3,zmm30,zmm18 )
+ vfmadd231pd( zmm4,zmm30,zmm19 )
+ lea(mem(r11,r8,8), r11) // b_next += 8*rs_b
+ lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b
+ dec(rsi) // i -= 1
+ jnz(.LOOP3) // iterate again if i != 0.
+
+
+ label(.TAIL)
+ mov(var(k_left), rsi) // i = k_left
+ test(rsi, rsi) // check i via logical AND
+ je(.DPOSTACCUM) // if i == 0, jump to post-accumulation
+
+ label(.DLOOPKLEFT) // k_left loop
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+ dec(rsi) // i -= 1
+ jne(.DLOOPKLEFT) // iterate again if i != 0.
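Every broadcast block in the k loops above reads the seven columns of B through two base pointers: only 1x/2x/4x/8x index scales exist in x86 addressing, so columns at 5*cs_b and 6*cs_b cannot be formed from rbx directly, and the kernel instead precomputes r13 = 3*cs_b and a second pointer r12 = rbx + 4*cs_b. A sketch of the seven effective addresses under that scheme (b_column_addrs is an illustrative name; cs_b is in elements here, whereas r9 holds it in bytes):

    #include <stdint.h>

    // The seven per-k broadcast addresses of B, as formed by the kernels above.
    static void b_column_addrs( const double* b, int64_t cs_b,
                                const double* addr[7] )
    {
        const double* rbx = b;            // base pointer: columns 0..3
        const double* r12 = b + 4 * cs_b; // second pointer: columns 4..6
        int64_t       r13 = 3 * cs_b;     // 3*cs_b is not a valid SIB scale

        addr[0] = rbx;                    // mem(rbx)
        addr[1] = rbx + cs_b;             // mem(rbx,r9,1)
        addr[2] = rbx + 2 * cs_b;         // mem(rbx,r9,2)
        addr[3] = rbx + r13;              // mem(rbx,r13,1)
        addr[4] = r12;                    // mem(r12)
        addr[5] = r12 + cs_b;             // mem(r12,r9,1)
        addr[6] = r12 + 2 * cs_b;         // mem(r12,r9,2)
    }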
+
+
+ label(.DPOSTACCUM)
+ mov(var(alpha), rdx) // load address of alpha
+ vbroadcastsd(mem(rdx), zmm30) // broadcast alpha
+ mov(var(beta), rax) // load address of beta
+ vbroadcastsd(mem(rax), zmm31) // broadcast beta
+
+ // scale by alpha
+ vmulpd( zmm30,zmm6,zmm6 )
+ vmulpd( zmm30,zmm7,zmm7 )
+ vmulpd( zmm30,zmm8,zmm8 )
+ vmulpd( zmm30,zmm9,zmm9 )
+ vmulpd( zmm30,zmm10,zmm10 )
+ vmulpd( zmm30,zmm11,zmm11 )
+ vmulpd( zmm30,zmm12,zmm12 )
+ vmulpd( zmm30,zmm13,zmm13 )
+ vmulpd( zmm30,zmm14,zmm14 )
+ vmulpd( zmm30,zmm15,zmm15 )
+ vmulpd( zmm30,zmm16,zmm16 )
+ vmulpd( zmm30,zmm17,zmm17 )
+ vmulpd( zmm30,zmm18,zmm18 )
+ vmulpd( zmm30,zmm19,zmm19 )
+ vmulpd( zmm30,zmm22,zmm22 )
+
+
+ mov(var(rs_c), rsi) // load rs_c
+ lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
+ lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c
+ lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c
+ vxorpd(ymm2, ymm2, ymm2)
+ vucomisd(xmm2, xmm31) // set ZF if beta == 0
+ je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case
+
+
+ cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8
+
+
+ jz(.DROWSTORED) // jump to row storage case
+
+ label(.DCOLSTORED)
+ vmovupd( mem(rcx),zmm0)
+ vfmadd231pd( zmm0,zmm31,zmm6)
+ vmovupd( zmm6,(rcx))
+ vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm1,zmm31,zmm7)
+ vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rcx,rdi,1),zmm3)
+ vfmadd231pd( zmm3,zmm31,zmm8)
+ vmovupd( zmm8,(rcx,rdi,1))
+ vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm4,zmm31,zmm9)
+ vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rcx,rdi,2),zmm0)
+ vfmadd231pd( zmm0,zmm31,zmm10)
+ vmovupd( zmm10,(rcx,rdi,2))
+ vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm1,zmm31,zmm11)
+ vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rcx,r13,1),zmm3)
+ vfmadd231pd( zmm3,zmm31,zmm12)
+ vmovupd( zmm12,(rcx,r13,1))
+ vmovupd( 0x40(rcx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm4,zmm31,zmm13)
+ vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rdx),zmm0)
+ vfmadd231pd( zmm0,zmm31,zmm14)
+ vmovupd( zmm14,(rdx))
+ vmovupd( 0x40(rdx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm1,zmm31,zmm15)
+ vmovupd( zmm15,0x40(rdx) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rdx,rdi,1),zmm3)
+ vfmadd231pd( zmm3,zmm31,zmm16)
+ vmovupd( zmm16,(rdx,rdi,1))
+ vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm4,zmm31,zmm17)
+ vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rdx,rdi,2),zmm0)
+ vfmadd231pd( zmm0,zmm31,zmm18)
+ vmovupd( zmm18,(rdx,rdi,2))
+ vmovupd( 0x40(rdx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm1,zmm31,zmm19)
+ vmovupd( zmm19,0x40(rdx,rdi,2) MASK_(k(2))) // store to C with mask
+
+ jmp(.DDONE) // jump to end.
+
+ label(.DROWSTORED)
+
+ // yet to be implemented
+ jmp(.DDONE) // jump to end.
+
+
+ label(.DBETAZERO)
+ cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8
+
+ jz(.DROWSTORBZ) // jump to row storage case
+ label(.DCOLSTORBZ)
+ vmovupd( zmm6,(rcx))
+ vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm8,(rcx,rdi,1))
+ vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm10,(rcx,rdi,2))
+ vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm12,(rcx,r13,1))
+ vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm14,(rdx))
+ vmovupd( zmm15,0x40(rdx) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm16,(rdx,rdi,1))
+ vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm18,(rdx,rdi,2))
+ vmovupd( zmm19,0x40(rdx,rdi,2) MASK_(k(2))) // store to C with mask
+
+ jmp(.DDONE) // jump to end.
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+ }
+
+
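A scalar reference for the .DPOSTACCUM/.DCOLSTORED/.DBETAZERO logic shared by these kernels: the accumulators are scaled by alpha, and the vucomisd check skips reading C entirely when beta == 0. A minimal sketch, assuming plain integer types in place of dim_t/inc_t (post_accum_ref is an illustrative name, not from the patch):

    #include <stdint.h>

    // C(i,j) = alpha*AB(i,j) + beta*C(i,j); beta == 0 skips the read of C,
    // matching the .DBETAZERO branch. The kernels do this 8 rows at a time,
    // with masked loads/stores covering the partial last row block.
    static void post_accum_ref( int64_t m, int64_t n, double alpha, double beta,
                                const double* ab,          // m x n accumulators
                                double* c, int64_t rs_c, int64_t cs_c )
    {
        for ( int64_t j = 0; j < n; ++j )
            for ( int64_t i = 0; i < m; ++i )
            {
                double t = alpha * ab[ i + j * m ];
                if ( beta != 0.0 )
                    t += beta * c[ i * rs_c + j * cs_c ];
                c[ i * rs_c + j * cs_c ] = t;
            }
    }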
+ * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that 3 elements
+ * have to be loaded/stored into/from the register; so 0xff (11111111) is shifted
+ * right by (8-3) bits, which makes the mask (00000111).
+ */
+ uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+ // For the special cases where m_left = 24/16/8, all 8 elements have to be
+ // loaded/stored, so the mask becomes 0xff (11111111).
+ if (mask == 0) mask = 0xff;
+
+ // -------------------------------------------------------------------------
+ begin_asm()
+
+ mov(var(a), rax) // load address of a
+ mov(var(cs_a), r10) // load cs_a
+ mov(var(b), rbx) // load address of b
+ mov(var(rs_b), r8) // load rs_b
+ mov(var(cs_b), r9) // load cs_b
+ mov(var(c), rcx) // load address of c
+ mov(var(cs_c), rdi) // load cs_c
+ mov(var(mask), rdx) // load mask
+ kmovw(edx, k(2)) // move mask to k2 register
+ lea(mem(, r8, 8), r8) // rs_b *= sizeof(double)
+ lea(mem(, r9, 8), r9) // cs_b *= sizeof(double)
+ lea(mem(, r10, 8), r10) // cs_a *= sizeof(double)
+ lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
+ lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b
+ // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+ // is also used to traverse the B matrix
+ lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b
+ lea(mem(rcx, 7*8), rdx) // C for prefetching
+ lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching)
+ // if n > 4, a second pointer (r15), which points to r11 + 4*cs_b,
+ // is also used to prefetch from the B matrix
+ lea(mem(r11, r9, 4), r15) // r15 = r11 + 4*cs_b(B for prefetching)
+
+ /* Register usage: zmm0-5 are used to load A matrix
+ * zmm6-29 are used for accumulation
+ * zmm30-31 are used for broadcasting B matrix
+ */
+
+ // zero out all accumulation registers
+ vxorpd(zmm6, zmm6, zmm6)
+ vxorpd(zmm8, zmm8, zmm8)
+ vxorpd(zmm10, zmm10, zmm10)
+ vxorpd(zmm12, zmm12, zmm12)
+ vxorpd(zmm14, zmm14, zmm14)
+ vxorpd(zmm16, zmm16, zmm16)
+ vxorpd(zmm18, zmm18, zmm18)
+
+ // K is unrolled by 8 to facilitate prefetch of B.
+ // Assuming B to be col-stored, for each iteration of K,
+ // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b.
+ label(.DLOOPKITER) // main loop
+ mov(var(k_iter), rsi) // i = k_iter
+ sub(imm( 7+TAIL_NITER), rsi) // i -= NR + TAIL_NITER
+ jle(.PREFETCHLOOP) // jump if i <= 0
+
+ label(.LOOP1)
+
+ // ---------------------------------- iteration 1
+
+ vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+
+ // ---------------------------------- iteration 2
+
+ vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11,r9,1) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd(
mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( 
mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + + // ---------------------------------- iteration 8 + + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(7), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. 
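+ // The k_iter loop is split into three phases; as a rough sketch
+ // (illustration only, not literal code):
+ //
+ //   for ( i = 0; i < k_iter - NR - TAIL_NITER; ++i ) // .LOOP1
+ //       fma_block();                                 // B prefetch only
+ //   for ( ; i < k_iter - TAIL_NITER; ++i )           // .LOOP2, NR iterations
+ //       { prefetch_col_of_C(); fma_block(); }
+ //   for ( ; i < k_iter; ++i )                        // .LOOP3, TAIL_NITER iterations
+ //       fma_block();
+ //
+ // Issuing the C prefetches TAIL_NITER iterations before the k loop ends
+ // gives them time to complete behind the remaining FMA work.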
+ + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + 
vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. 
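+ // Iteration bookkeeping (worked example): with NR = 7 and TAIL_NITER = 3,
+ // k_iter = 16 runs 16 - (7+3) = 6 iterations in .LOOP1, then 7 in .LOOP2
+ // (one column of C prefetched per iteration), then 3 in .LOOP3 below.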
+ label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // 
a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + + // ---------------------------------- iteration 8 + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm30,zmm18 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. 
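+ // The .TAIL section below drains the k0 % 8 leftover iterations one
+ // rank-1 update at a time; roughly (sketch only, placeholder names):
+ //
+ //   for ( i = 0; i < k_left; ++i, a += cs_a, b += rs_b )
+ //       for ( j = 0; j < 7; ++j )   // one FMA per column of B
+ //           acc[j] += a_vec * broadcast( b[j*cs_b] );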
+ + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm30,zmm18 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm14,zmm14 ) + vmulpd( zmm30,zmm16,zmm16 ) + vmulpd( zmm30,zmm18,zmm18 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,r13,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm14) + vmovupd( zmm14,(rdx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm3,zmm31,zmm16) + vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm0,zmm31,zmm18) + vmovupd( zmm18,(rdx,rdi,2) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. 
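+ // When beta == 0 the kernel takes the .DBETAZERO path below: C is never
+ // loaded, and the alpha-scaled accumulators are written out directly with
+ // masked stores, e.g. (illustrative intrinsics; c_ptr/k2/acc are placeholders):
+ //   _mm512_mask_storeu_pd( c_ptr, k2, acc );
+ // Skipping the load also avoids reading C when it may be uninitialized.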
+
+
+ label(.DBETAZERO)
+ cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8
+
+ jz(.DROWSTORBZ) // jump to row storage case
+ label(.DCOLSTORBZ)
+ vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm14,(rdx) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( zmm18,(rdx,rdi,2) MASK_(k(2))) // store to C with mask
+
+ jmp(.DDONE) // jump to end.
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c
new file mode 100644
index 000000000..8bf041cbe
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c
@@ -0,0 +1,3383 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE UNIVERSITY
+ OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/* These kernels assume that the A matrix is in col-major order.
+ * The B matrix can be col/row-major.
+ * The C matrix can be col/row-major, though support for row-major order
+ * will be added by a separate commit.
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x8
+(
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double precision elements can be loaded into a 512-bit register.
+ * So, we use an 8-bit mask to specify which elements are to be loaded/stored
+ * into/from the register. m_left % 8 specifies how many elements
+ * are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that 3 elements
+ * have to be loaded/stored into/from the register; so 0xff (11111111) is shifted
+ * right by (8-3) bits, which makes the mask (00000111).
+ */
+ uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+ // For the special cases where m_left = 24/16/8, all 8 elements have to be
+ // loaded/stored, so the mask becomes 0xff (11111111).
+ if (mask == 0) mask = 0xff;
+
+ // -------------------------------------------------------------------------
+ begin_asm()
+
+ mov(var(a), rax) // load address of a
+ mov(var(cs_a), r10) // load cs_a
+ mov(var(b), rbx) // load address of b
+ mov(var(rs_b), r8) // load rs_b
+ mov(var(cs_b), r9) // load cs_b
+ mov(var(c), rcx) // load address of c
+ mov(var(cs_c), rdi) // load cs_c
+ mov(var(mask), rdx) // load mask
+ kmovw(edx, k(2)) // move mask to k2 register
+ lea(mem(, r8, 8), r8) // rs_b *= sizeof(double)
+ lea(mem(, r9, 8), r9) // cs_b *= sizeof(double)
+ lea(mem(, r10, 8), r10) // cs_a *= sizeof(double)
+ lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
+ lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b
+ // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+ // is also used to traverse the B matrix
+ lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b
+ lea(mem(rcx, 7*8), rdx) // C for prefetching
+ lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b(B for prefetching)
+ // if n > 4, a second pointer (r15), which points to r11 + 4*cs_b,
+ // is also used to prefetch from the B matrix
+ lea(mem(r11, r9, 4), r15) // r15 = r11 + 4*cs_b(B for prefetching)
+
+ /* Register usage: zmm0-5 are used to load A matrix
+ * zmm6-29 are used for accumulation
+ * zmm30-31 are used for broadcasting B matrix
+ */
+
+ // zero out all accumulation registers
+ vxorpd(zmm6, zmm6, zmm6)
+ vxorpd(zmm7, zmm7, zmm7)
+ vxorpd(zmm28, zmm28, zmm28)
+ vxorpd(zmm8, zmm8, zmm8)
+ vxorpd(zmm9, zmm9, zmm9)
+ vxorpd(zmm29, zmm29, zmm29)
+ vxorpd(zmm10, zmm10, zmm10)
+ vxorpd(zmm11, zmm11, zmm11)
+ vxorpd(zmm26, zmm26, zmm26)
+ vxorpd(zmm12, zmm12, zmm12)
+ vxorpd(zmm13, zmm13, zmm13)
+ vxorpd(zmm27, zmm27, zmm27)
+ vxorpd(zmm14, zmm14, zmm14)
+ vxorpd(zmm15, zmm15, zmm15)
+ vxorpd(zmm24, zmm24, zmm24)
+ vxorpd(zmm16, zmm16, zmm16)
+ vxorpd(zmm17, zmm17, zmm17)
+ vxorpd(zmm25, zmm25, zmm25)
+ vxorpd(zmm18, zmm18, zmm18)
+ vxorpd(zmm19, zmm19, zmm19)
+ vxorpd(zmm22, zmm22, zmm22)
+ vxorpd(zmm20, zmm20, zmm20)
+ vxorpd(zmm21, zmm21, zmm21)
+ vxorpd(zmm23, zmm23, zmm23)
+
+ // K is unrolled by 8 to facilitate prefetch of B.
+ // Assuming B to be col-stored, for each iteration of K,
+ // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b.
+ label(.DLOOPKITER) // main loop
+ mov(var(k_iter), rsi) // i = k_iter
+ sub(imm( 8+TAIL_NITER), rsi) // i -= NR + TAIL_NITER
+ jle(.PREFETCHLOOP) // jump if i <= 0
+
+ label(.LOOP1)
+
+ // ---------------------------------- iteration 1
+
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 )
+ vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 )
+ vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vfmadd231pd( zmm2,zmm30,zmm28 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + 
vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + 
vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 8 + + prefetch( 0,mem(r15,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( 
zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(8), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + 
vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 3 + prefetchw0( mem(rdx, 128)) // prefetch C + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) 
+ vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( 
zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 8 + prefetch( 0,mem(r15,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. 
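+ // In this 24x8 kernel a column of C spans 24 doubles (192 bytes, i.e.
+ // three cache lines), so .LOOP2 above issues three prefetchw0
+ // instructions per iteration, at rdx, rdx+64 and rdx+128, before
+ // advancing rdx by cs_c. Roughly equivalent intrinsics (illustrative
+ // only; c_col is a placeholder, and _MM_HINT_ET0 availability is
+ // compiler-dependent):
+ //   _mm_prefetch( (char const*)c_col,       _MM_HINT_ET0 );
+ //   _mm_prefetch( (char const*)c_col + 64,  _MM_HINT_ET0 );
+ //   _mm_prefetch( (char const*)c_col + 128, _MM_HINT_ET0 );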
+ label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) 
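+ // zmm30/zmm31 alternate as broadcast registers for the 8 columns of B
+ // (rbx covers columns 0-3, r12 = rbx + 4*cs_b covers columns 4-7), so each
+ // new broadcast overlaps with the FMAs that consume the previous one.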
+ vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( 
zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 ) + vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + + // ---------------------------------- iteration 8 + prefetch( 0,mem(r15,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( 
mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm5,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm5,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm5,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm5,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + vfmadd231pd( zmm5,zmm31,zmm23 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. + + + label(.TAIL) + mov(var(k_left), rsi) // i = k_left + test(rsi, rsi) // check i via logical AND + je(.DPOSTACCUM) // if i == 0, jump to post-accumulation + + label(.DLOOPKLEFT) // k_left loop + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 ) + vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vfmadd231pd( zmm2,zmm30,zmm28 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vfmadd231pd( zmm2,zmm31,zmm29 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vfmadd231pd( zmm2,zmm30,zmm26 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vfmadd231pd( zmm2,zmm31,zmm27 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vfmadd231pd( zmm2,zmm30,zmm24 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vfmadd231pd( zmm2,zmm31,zmm25 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + vfmadd231pd( zmm2,zmm30,zmm22 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + vfmadd231pd( zmm2,zmm31,zmm23 ) + dec(rsi) // i -= 1 + jne(.DLOOPKLEFT) // iterate again if i != 0. 
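+ // End of the k_left loop: the k0 % 8 leftover rank-1 updates are handled
+ // one k at a time, reusing the same masked loads of A but without any B
+ // prefetching.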
+ + + label(.DPOSTACCUM) + mov(var(alpha), rdx) // load address of alpha + vbroadcastsd(mem(rdx), zmm30) // broadcast alpha + mov(var(beta), rax) // load address of beta + vbroadcastsd(mem(rax), zmm31) // broadcast beta + + // scale by alpha + vmulpd( zmm30,zmm6,zmm6 ) + vmulpd( zmm30,zmm7,zmm7 ) + vmulpd( zmm30,zmm28,zmm28 ) + vmulpd( zmm30,zmm8,zmm8 ) + vmulpd( zmm30,zmm9,zmm9 ) + vmulpd( zmm30,zmm29,zmm29 ) + vmulpd( zmm30,zmm10,zmm10 ) + vmulpd( zmm30,zmm11,zmm11 ) + vmulpd( zmm30,zmm26,zmm26 ) + vmulpd( zmm30,zmm12,zmm12 ) + vmulpd( zmm30,zmm13,zmm13 ) + vmulpd( zmm30,zmm27,zmm27 ) + vmulpd( zmm30,zmm14,zmm14 ) + vmulpd( zmm30,zmm15,zmm15 ) + vmulpd( zmm30,zmm24,zmm24 ) + vmulpd( zmm30,zmm16,zmm16 ) + vmulpd( zmm30,zmm17,zmm17 ) + vmulpd( zmm30,zmm25,zmm25 ) + vmulpd( zmm30,zmm18,zmm18 ) + vmulpd( zmm30,zmm19,zmm19 ) + vmulpd( zmm30,zmm22,zmm22 ) + vmulpd( zmm30,zmm20,zmm20 ) + vmulpd( zmm30,zmm21,zmm21 ) + vmulpd( zmm30,zmm23,zmm23 ) + + + mov(var(rs_c), rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c + vxorpd(ymm2, ymm2, ymm2) + vucomisd(xmm2, xmm31) // set ZF if beta == 0 + je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case + + + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + + jz(.DROWSTORED) // jump to row storage case + + label(.DCOLSTORED) + vmovupd( mem(rcx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm6) + vmovupd( zmm6,(rcx)) + vmovupd( 0x40(rcx),zmm1) + vfmadd231pd( zmm1,zmm31,zmm7) + vmovupd( zmm7,0x40(rcx)) + vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm28) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm8) + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( 0x40(rcx,rdi,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm9) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm29) + vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,rdi,2),zmm0) + vfmadd231pd( zmm0,zmm31,zmm10) + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( 0x40(rcx,rdi,2),zmm1) + vfmadd231pd( zmm1,zmm31,zmm11) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm26) + vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rcx,r13,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm12) + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( 0x40(rcx,r13,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm13) + vmovupd( zmm13,0x40(rcx,r13,1)) + vmovupd( 0x80(rcx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm27) + vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm14) + vmovupd( zmm14,(rdx)) + vmovupd( 0x40(rdx),zmm1) + vfmadd231pd( zmm1,zmm31,zmm15) + vmovupd( zmm15,0x40(rdx)) + vmovupd( 0x80(rdx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm24) + vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm16) + vmovupd( zmm16,(rdx,rdi,1)) + vmovupd( 0x40(rdx,rdi,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm17) + vmovupd( zmm17,0x40(rdx,rdi,1)) + vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( 
zmm5,zmm31,zmm25) + vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx,rdi,2),zmm0) + vfmadd231pd( zmm0,zmm31,zmm18) + vmovupd( zmm18,(rdx,rdi,2)) + vmovupd( 0x40(rdx,rdi,2),zmm1) + vfmadd231pd( zmm1,zmm31,zmm19) + vmovupd( zmm19,0x40(rdx,rdi,2)) + vmovupd( 0x80(rdx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm2,zmm31,zmm22) + vmovupd( zmm22,0x80(rdx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx,r13,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm20) + vmovupd( zmm20,(rdx,r13,1)) + vmovupd( 0x40(rdx,r13,1),zmm4) + vfmadd231pd( zmm4,zmm31,zmm21) + vmovupd( zmm21,0x40(rdx,r13,1)) + vmovupd( 0x80(rdx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm5,zmm31,zmm23) + vmovupd( zmm23,0x80(rdx,r13,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx)) + vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1)) + vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( zmm11,0x40(rcx,rdi,2)) + vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( zmm13,0x40(rcx,r13,1)) + vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm14,(rdx)) + vmovupd( zmm15,0x40(rdx)) + vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask + vmovupd( zmm16,(rdx,rdi,1)) + vmovupd( zmm17,0x40(rdx,rdi,1)) + vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm18,(rdx,rdi,2)) + vmovupd( zmm19,0x40(rdx,rdi,2)) + vmovupd( zmm22,0x80(rdx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( zmm20,(rdx,r13,1)) + vmovupd( zmm21,0x40(rdx,r13,1)) + vmovupd( zmm23,0x80(rdx,r13,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
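+ // The .DCOLSTORBZ/.DROWSTORBZ paths only write C and never read it, so when
+ // beta == 0 an uninitialized C buffer (possibly containing NaNs or Infs)
+ // cannot contaminate the result.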
+
+
+ label(.DROWSTORBZ)
+
+ // yet to be implemented
+ label(.DDONE)
+
+
+ vzeroupper()
+
+ end_asm(
+ : // output operands (none)
+ : // input operands
+ [k_iter] "m" (k_iter),
+ [k_left] "m" (k_left),
+ [a] "m" (a),
+ [rs_a] "m" (rs_a),
+ [cs_a] "m" (cs_a),
+ [ps_a8] "m" (ps_a8),
+ [b] "m" (b),
+ [rs_b] "m" (rs_b),
+ [cs_b] "m" (cs_b),
+ [alpha] "m" (alpha),
+ [beta] "m" (beta),
+ [c] "m" (c),
+ [rs_c] "m" (rs_c),
+ [cs_c] "m" (cs_c),
+ [n0] "m" (n0),
+ [m0] "m" (m0),
+ [mask] "m" (mask)
+ : // register clobber list
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "zmm0", "zmm1", "zmm2", "zmm3",
+ "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+ "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+ "zmm16", "zmm17", "zmm18", "zmm19",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+ "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+ "memory"
+ )
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x8
+(
+ conj_t conja,
+ conj_t conjb,
+ dim_t m0,
+ dim_t n0,
+ dim_t k0,
+ double* restrict alpha,
+ double* restrict a, inc_t rs_a0, inc_t cs_a0,
+ double* restrict b, inc_t rs_b0, inc_t cs_b0,
+ double* restrict beta,
+ double* restrict c, inc_t rs_c0, inc_t cs_c0,
+ auxinfo_t* restrict data,
+ cntx_t* restrict cntx
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+ // Typecast local copies of integers in case dim_t and inc_t are a
+ // different size than is expected by load instructions.
+ uint64_t rs_a = rs_a0;
+ uint64_t cs_a = cs_a0;
+ uint64_t rs_b = rs_b0;
+ uint64_t cs_b = cs_b0;
+ uint64_t rs_c = rs_c0;
+ uint64_t cs_c = cs_c0;
+
+ uint64_t ps_a = bli_auxinfo_ps_a( data );
+ uint64_t ps_a8 = ps_a * sizeof( double );
+
+ uint64_t k_iter = (uint64_t)k0 / 8;
+ uint64_t k_left = (uint64_t)k0 % 8;
+
+ /* 8 double-precision elements can be loaded into a 512-bit register,
+ * so we use an 8-bit mask to specify which elements are to be loaded/stored
+ * into/from the register. m_left % 8 specifies how many elements
+ * are to be loaded/stored into/from the last register.
+ * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that 3 elements
+ * have to be loaded/stored into/from the register, so 0xff (11111111) is shifted
+ * right by (8-3) bits, which makes the mask (00000111).
+ */
+ uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+ // For the special cases where m_left = 24/16/8, all 8 elements have to be
+ // loaded/stored, so the mask becomes 0xff (11111111).
+ if (mask == 0) mask = 0xff;
+
+ // -------------------------------------------------------------------------
+ begin_asm()
+
+ mov(var(a), rax) // load address of a
+ mov(var(cs_a), r10) // load cs_a
+ mov(var(b), rbx) // load address of b
+ mov(var(rs_b), r8) // load rs_b
+ mov(var(cs_b), r9) // load cs_b
+ mov(var(c), rcx) // load address of c
+ mov(var(cs_c), rdi) // load cs_c
+ mov(var(mask), rdx) // load mask
+ kmovw(edx, k(2)) // move mask to k2 register
+ lea(mem(, r8, 8), r8) // rs_b *= sizeof(double)
+ lea(mem(, r9, 8), r9) // cs_b *= sizeof(double)
+ lea(mem(, r10, 8), r10) // cs_a *= sizeof(double)
+ lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
+ lea(mem(r9, r9, 2 ), r13) // r13 = 3*cs_b
+ // If n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+ // is also used to traverse the B matrix.
+ lea(mem(rbx, r9, 4), r12) // r12 = rbx + 4*cs_b
+ lea(mem(rcx, 7*8), rdx) // C for prefetching
+ lea(mem(rbx, r8, 8, 7*8), r11) // r11 = rbx + 8*rs_b (B for prefetching)
+ // If n > 4, a second pointer (r15), which points to r11 + 4*cs_b,
+ // is also used to prefetch from the B matrix.
+ lea(mem(r11, r9, 4), r15) // r15 = r11 + 4*cs_b (B for prefetching)
+
+ /* Register usage: zmm0-1 and zmm3-4 are used to load the A matrix,
+ * zmm6-21 are used for accumulation and
+ * zmm30-31 are used for broadcasting the B matrix.
+ */
+
+ // zero out all accumulation registers
+ vxorpd(zmm6, zmm6, zmm6)
+ vxorpd(zmm7, zmm7, zmm7)
+ vxorpd(zmm8, zmm8, zmm8)
+ vxorpd(zmm9, zmm9, zmm9)
+ vxorpd(zmm10, zmm10, zmm10)
+ vxorpd(zmm11, zmm11, zmm11)
+ vxorpd(zmm12, zmm12, zmm12)
+ vxorpd(zmm13, zmm13, zmm13)
+ vxorpd(zmm14, zmm14, zmm14)
+ vxorpd(zmm15, zmm15, zmm15)
+ vxorpd(zmm16, zmm16, zmm16)
+ vxorpd(zmm17, zmm17, zmm17)
+ vxorpd(zmm18, zmm18, zmm18)
+ vxorpd(zmm19, zmm19, zmm19)
+ vxorpd(zmm20, zmm20, zmm20)
+ vxorpd(zmm21, zmm21, zmm21)
+
+ // K is unrolled by 8 to facilitate prefetch of B.
+ // Assuming B is col-stored, for each iteration of K,
+ // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b.
+ label(.DLOOPKITER) // main loop
+ mov(var(k_iter), rsi) // i = k_iter
+ sub(imm( 8+TAIL_NITER), rsi) // i -= NR + TAIL_NITER
+ jle(.PREFETCHLOOP) // jump if i <= 0
+
+ label(.LOOP1)
+
+ // ---------------------------------- iteration 1
+
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vmovupd( mem(rax),zmm3 ) // load A
+ vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ prefetch( 0,mem(r11) ) // prefetch B
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd(
zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 2 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + + // ---------------------------------- iteration 3 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 4 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + 
vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + + // ---------------------------------- iteration 5 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 6 + + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + + // ---------------------------------- iteration 7 + + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + 
vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 8 + + prefetch( 0,mem(r15,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP1) // iterate again if i != 0. + + label(.PREFETCHLOOP) + add(imm(8), rsi) // i += NR + jle(.TAILITER) // jump if i <= 0. + + label(.LOOP2) + + // ---------------------------------- iteration 1 + prefetchw0( mem(rdx)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 2 + prefetchw0( mem(rdx, 64)) // prefetch C + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( 
mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 
) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 8 + prefetch( 0,mem(r15,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + 
vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + lea(mem(rdx, rdi, 1), rdx) // C += cs_c + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b + sub(imm(1), rsi) // i -= 1 + jnz(.LOOP2) // iterate again if i != 0. + label(.TAILITER) + add(imm(TAIL_NITER), rsi) // i += TAIL_NITER + jle(.TAIL) // jump if i <= 0 + + label(.LOOP3) + + // ---------------------------------- iteration 1 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 2 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + + // ---------------------------------- iteration 3 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + 
vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 4 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r11,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + + // ---------------------------------- iteration 5 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 6 + vmovupd( mem(rax),zmm0 ) // load A + vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + 
vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + + // ---------------------------------- iteration 7 + vmovupd( mem(rax),zmm3 ) // load A + vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + add( r10,rax ) // a += cs_a + prefetch( 0,mem(r15,r9,2) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm6 ) + vfmadd231pd( zmm1,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm8 ) + vfmadd231pd( zmm1,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm10 ) + vfmadd231pd( zmm1,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm0,zmm31,zmm12 ) + vfmadd231pd( zmm1,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm14 ) + vfmadd231pd( zmm1,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm0,zmm31,zmm16 ) + vfmadd231pd( zmm1,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm0,zmm30,zmm18 ) + vfmadd231pd( zmm1,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm0,zmm31,zmm20 ) + vfmadd231pd( zmm1,zmm31,zmm21 ) + + // ---------------------------------- iteration 8 + prefetch( 0,mem(r15,r13,1) ) // prefetch B + vbroadcastsd( mem(rbx),zmm30 ) + vbroadcastsd( mem(rbx,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm4,zmm30,zmm7 ) + vbroadcastsd( mem(rbx,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm4,zmm31,zmm9 ) + vbroadcastsd( mem(rbx,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) + vbroadcastsd( mem(r12),zmm30 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) + vbroadcastsd( mem(r12,r9,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vbroadcastsd( mem(r12,r9,2),zmm30 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) + vbroadcastsd( mem(r12,r13,1),zmm31 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) + add( r8,r12 ) // second pointer of b += rs_b + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) + lea(mem(r11,r8,8), r11) // b_next += 8*rs_b + lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b + dec(rsi) // i -= 1 + jnz(.LOOP3) // iterate again if i != 0. 
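+ // In this 16x8 kernel each column of A spans two zmm registers: rows 0-7
+ // are loaded unmasked and rows 8-15 under mask k2, with the zmm0/zmm1 and
+ // zmm3/zmm4 pairs double-buffered across alternating k iterations.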
+
+
+ label(.TAIL)
+ mov(var(k_left), rsi) // i = k_left
+ test(rsi, rsi) // check i via logical AND
+ je(.DPOSTACCUM) // if i == 0, jump to post-accumulation
+
+ label(.DLOOPKLEFT) // k_left loop
+ vmovupd( mem(rax),zmm0 ) // load A
+ vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint
+ add( r10,rax ) // a += cs_a
+ vbroadcastsd( mem(rbx),zmm30 )
+ vbroadcastsd( mem(rbx,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm6 )
+ vfmadd231pd( zmm1,zmm30,zmm7 )
+ vbroadcastsd( mem(rbx,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm8 )
+ vfmadd231pd( zmm1,zmm31,zmm9 )
+ vbroadcastsd( mem(rbx,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm10 )
+ vfmadd231pd( zmm1,zmm30,zmm11 )
+ vbroadcastsd( mem(r12),zmm30 )
+ add( r8,rbx ) // b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm12 )
+ vfmadd231pd( zmm1,zmm31,zmm13 )
+ vbroadcastsd( mem(r12,r9,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm14 )
+ vfmadd231pd( zmm1,zmm30,zmm15 )
+ vbroadcastsd( mem(r12,r9,2),zmm30 )
+ vfmadd231pd( zmm0,zmm31,zmm16 )
+ vfmadd231pd( zmm1,zmm31,zmm17 )
+ vbroadcastsd( mem(r12,r13,1),zmm31 )
+ vfmadd231pd( zmm0,zmm30,zmm18 )
+ vfmadd231pd( zmm1,zmm30,zmm19 )
+ add( r8,r12 ) // second pointer of b += rs_b
+ vfmadd231pd( zmm0,zmm31,zmm20 )
+ vfmadd231pd( zmm1,zmm31,zmm21 )
+ dec(rsi) // i -= 1
+ jne(.DLOOPKLEFT) // iterate again if i != 0.
+
+
+ label(.DPOSTACCUM)
+ mov(var(alpha), rdx) // load address of alpha
+ vbroadcastsd(mem(rdx), zmm30) // broadcast alpha
+ mov(var(beta), rax) // load address of beta
+ vbroadcastsd(mem(rax), zmm31) // broadcast beta
+
+ // scale by alpha (only zmm6-21 are live accumulators in this kernel)
+ vmulpd( zmm30,zmm6,zmm6 )
+ vmulpd( zmm30,zmm7,zmm7 )
+ vmulpd( zmm30,zmm8,zmm8 )
+ vmulpd( zmm30,zmm9,zmm9 )
+ vmulpd( zmm30,zmm10,zmm10 )
+ vmulpd( zmm30,zmm11,zmm11 )
+ vmulpd( zmm30,zmm12,zmm12 )
+ vmulpd( zmm30,zmm13,zmm13 )
+ vmulpd( zmm30,zmm14,zmm14 )
+ vmulpd( zmm30,zmm15,zmm15 )
+ vmulpd( zmm30,zmm16,zmm16 )
+ vmulpd( zmm30,zmm17,zmm17 )
+ vmulpd( zmm30,zmm18,zmm18 )
+ vmulpd( zmm30,zmm19,zmm19 )
+ vmulpd( zmm30,zmm20,zmm20 )
+ vmulpd( zmm30,zmm21,zmm21 )
+
+
+ mov(var(rs_c), rsi) // load rs_c
+ lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
+ lea(mem(rcx, rdi, 4), rdx) // rdx = rcx + 4 * cs_c
+ lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c
+ vxorpd(ymm2, ymm2, ymm2)
+ vucomisd(xmm2, xmm31) // set ZF if beta == 0
+ je(.DBETAZERO) // if ZF == 1, jump to beta == 0 case
+
+
+ cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8
+
+
+ jz(.DROWSTORED) // jump to row storage case
+
+ label(.DCOLSTORED)
+ vmovupd( mem(rcx),zmm0)
+ vfmadd231pd( zmm0,zmm31,zmm6)
+ vmovupd( zmm6,(rcx))
+ vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm1,zmm31,zmm7)
+ vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rcx,rdi,1),zmm3)
+ vfmadd231pd( zmm3,zmm31,zmm8)
+ vmovupd( zmm8,(rcx,rdi,1))
+ vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm4,zmm31,zmm9)
+ vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rcx,rdi,2),zmm0)
+ vfmadd231pd( zmm0,zmm31,zmm10)
+ vmovupd( zmm10,(rcx,rdi,2))
+ vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint
+ vfmadd231pd( zmm1,zmm31,zmm11)
+ vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask
+ vmovupd( mem(rcx,r13,1),zmm3)
+ vfmadd231pd( zmm3,zmm31,zmm12)
+ vmovupd( zmm12,(rcx,r13,1))
+ vmovupd( 0x40(rcx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero
hint + vfmadd231pd( zmm4,zmm31,zmm13) + vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx),zmm0) + vfmadd231pd( zmm0,zmm31,zmm14) + vmovupd( zmm14,(rdx)) + vmovupd( 0x40(rdx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm15) + vmovupd( zmm15,0x40(rdx) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx,rdi,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm16) + vmovupd( zmm16,(rdx,rdi,1)) + vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm4,zmm31,zmm17) + vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx,rdi,2),zmm0) + vfmadd231pd( zmm0,zmm31,zmm18) + vmovupd( zmm18,(rdx,rdi,2)) + vmovupd( 0x40(rdx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm1,zmm31,zmm19) + vmovupd( zmm19,0x40(rdx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( mem(rdx,r13,1),zmm3) + vfmadd231pd( zmm3,zmm31,zmm20) + vmovupd( zmm20,(rdx,r13,1)) + vmovupd( 0x40(rdx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vfmadd231pd( zmm4,zmm31,zmm21) + vmovupd( zmm21,0x40(rdx,r13,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. + + label(.DROWSTORED) + + // yet to be implemented + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + cmp(imm(8), rdi) // set ZF if (8*cs_c) == 8 + + jz(.DROWSTORBZ) // jump to row storage case + label(.DCOLSTORBZ) + vmovupd( zmm6,(rcx)) + vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask + vmovupd( zmm8,(rcx,rdi,1)) + vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm10,(rcx,rdi,2)) + vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( zmm12,(rcx,r13,1)) + vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm14,(rdx)) + vmovupd( zmm15,0x40(rdx) MASK_(k(2))) // store to C with mask + vmovupd( zmm16,(rdx,rdi,1)) + vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2))) // store to C with mask + vmovupd( zmm18,(rdx,rdi,2)) + vmovupd( zmm19,0x40(rdx,rdi,2) MASK_(k(2))) // store to C with mask + vmovupd( zmm20,(rdx,r13,1)) + vmovupd( zmm21,0x40(rdx,r13,1) MASK_(k(2))) // store to C with mask + + jmp(.DDONE) // jump to end. 
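+ // The masked vmovupd stores above update only the lanes selected by k2, so
+ // rows of C beyond m0 in the second register block are left untouched.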
+
+
+void bli_dgemmsup_rv_zen4_asm_8x8
+(
+    conj_t              conja,
+    conj_t              conjb,
+    dim_t               m0,
+    dim_t               n0,
+    dim_t               k0,
+    double*    restrict alpha,
+    double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+    double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+    double*    restrict beta,
+    double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+    auxinfo_t* restrict data,
+    cntx_t*    restrict cntx
+)
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a = rs_a0;
+    uint64_t cs_a = cs_a0;
+    uint64_t rs_b = rs_b0;
+    uint64_t cs_b = cs_b0;
+    uint64_t rs_c = rs_c0;
+    uint64_t cs_c = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8 = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double-precision elements can be loaded into a 512-bit register,
+     * so we use an 8-bit mask to specify which elements are to be
+     * loaded/stored into/from the register. m_left % 8 specifies how many
+     * elements are to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, then m0 & 7 = 3, which indicates that 3
+     * elements have to be loaded/stored into/from the register; shifting
+     * 0xff (11111111) right by (8-3) bits makes the mask (00000111).
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7));       // calculate mask based on m_left
+    // For the special cases where m_left = 24/16/8, all 8 elements have to be
+    // loaded/stored, so the mask becomes 0xff (11111111).
+    if (mask == 0) mask = 0xff;
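(The mask arithmetic above can be sanity-checked in isolation. The following
standalone C sketch is an illustration only, not part of the patch; the helper
name dgemm_lane_mask is ours.)

    #include <stdint.h>
    #include <stdio.h>

    /* Low (m0 & 7) bits set; multiples of 8 keep all 8 lanes enabled. */
    static uint8_t dgemm_lane_mask( uint64_t m0 )
    {
        uint8_t mask = 0xff >> ( 0x8 - ( m0 & 7 ) );
        if ( mask == 0 ) mask = 0xff;
        return mask;
    }

    int main( void )
    {
        /* 19 -> 0x7 (3 lanes), 8 -> 0xff (all lanes), 5 -> 0x1f (5 lanes) */
        printf( "%#x %#x %#x\n", dgemm_lane_mask( 19 ),
                dgemm_lane_mask( 8 ), dgemm_lane_mask( 5 ) );
        return 0;
    }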
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                               // load address of a
+    mov(var(cs_a), r10)                            // load cs_a
+    mov(var(b), rbx)                               // load address of b
+    mov(var(rs_b), r8)                             // load rs_b
+    mov(var(cs_b), r9)                             // load cs_b
+    mov(var(c), rcx)                               // load address of c
+    mov(var(cs_c), rdi)                            // load cs_c
+    mov(var(mask), rdx)                            // load mask
+    kmovw(edx, k(2))                               // move mask to k2 register
+    lea(mem(, r8, 8), r8)                          // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)                          // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)                        // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)                        // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)                      // r13 = 3*cs_b
+    // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+    // is also used to traverse the B matrix
+    lea(mem(rbx, r9, 4), r12)                      // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)                        // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)                 // r11 = rbx + 8*rs_b (B for prefetching)
+    // if n > 4, a second pointer, which points to r11 + 4*cs_b,
+    // is also used to prefetch from the B matrix
+    lea(mem(r11, r9, 4), r15)                      // r15 = r11 + 4*cs_b (B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load the A matrix,
+     * zmm6-29 are used for accumulation, and
+     * zmm30-31 are used for broadcasting the B matrix.
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm16, zmm16, zmm16)
+    vxorpd(zmm18, zmm18, zmm18)
+    vxorpd(zmm20, zmm20, zmm20)
+
+    // K is unrolled by 8 to facilitate the prefetch of B.
+    // Assuming B to be column-stored, for each iteration of K,
+    // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b.
+    label(.DLOOPKITER)                             // main loop
+    mov(var(k_iter), rsi)                          // i = k_iter
+    sub(imm( 8+TAIL_NITER), rsi)                   // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                             // jump if i <= 0
+
+    label(.LOOP1)
+
+    // ---------------------------------- iteration 1
+
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11) )                         // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 2
+
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11,r9,1) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 3
+
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11,r9,2) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 4
+
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11,r13,1) )                   // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 5
+
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r15) )                         // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 6
+
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r15,r9,1) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 7
+
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r15,r9,2) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 8
+
+    prefetch( 0,mem(r15,r13,1) )                   // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+    lea(mem(r11,r8,8), r11)                        // b_next += 8*rs_b
+    lea(mem(r15,r8,8), r15)                        // second pointer to b_next += 8*rs_b
+    dec(rsi)                                       // i -= 1
+    jnz(.LOOP1)                                    // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(8), rsi)                               // i += NR
+    jle(.TAILITER)                                 // jump if i <= 0.
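(The three unrolled loops partition k_iter so that the NR = 8 cachelines of C
are prefetched in .LOOP2, TAIL_NITER iterations before the post-accumulation.
Below is a hedged C sketch of the split implied by the sub/add/jle logic
above; the helper name split_k_iter is ours, not BLIS's.)

    #include <stdint.h>

    /* loop1 + loop2 + loop3 == k_iter; loop2 <= 8 (one C prefetch each). */
    static void split_k_iter( int64_t k_iter, int64_t tail_niter,
                              int64_t* loop1, int64_t* loop2, int64_t* loop3 )
    {
        *loop1 = k_iter - 8 - tail_niter;        /* bulk FMAs   (.LOOP1) */
        if ( *loop1 < 0 ) *loop1 = 0;
        *loop2 = k_iter - tail_niter - *loop1;   /* C prefetch  (.LOOP2) */
        if ( *loop2 < 0 ) *loop2 = 0;
        *loop3 = k_iter - *loop1 - *loop2;       /* tail niters (.LOOP3) */
    }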
+
+    label(.LOOP2)
+
+    // ---------------------------------- iteration 1
+    prefetchw0( mem(rdx))                          // prefetch C
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11) )                         // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 2
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11,r9,1) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 3
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11,r9,2) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 4
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11,r13,1) )                   // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 5
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r15) )                         // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 6
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r15,r9,1) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 7
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r15,r9,2) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 8
+    prefetch( 0,mem(r15,r13,1) )                   // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+    lea(mem(rdx, rdi, 1), rdx)                     // C += cs_c
+    lea(mem(r11,r8,8), r11)                        // b_next += 8*rs_b
+    lea(mem(r15,r8,8), r15)                        // second pointer of b_next += 8*rs_b
+    sub(imm(1), rsi)                               // i -= 1
+    jnz(.LOOP2)                                    // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                      // i += TAIL_NITER
+    jle(.TAIL)                                     // jump if i <= 0
+
+    label(.LOOP3)
+
+    // ---------------------------------- iteration 1
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11) )                         // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 2
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11,r9,1) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 3
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11,r9,2) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 4
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r11,r13,1) )                   // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 5
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r15) )                         // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 6
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r15,r9,1) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 7
+    vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    prefetch( 0,mem(r15,r9,2) )                    // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+
+    // ---------------------------------- iteration 8
+    prefetch( 0,mem(r15,r13,1) )                   // prefetch B
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm3,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm3,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm3,zmm31,zmm20 )
+    lea(mem(r11,r8,8), r11)                        // b_next += 8*rs_b
+    lea(mem(r15,r8,8), r15)                        // second pointer of b_next += 8*rs_b
+    dec(rsi)                                       // i -= 1
+    jnz(.LOOP3)                                    // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                          // i = k_left
+    test(rsi, rsi)                                 // check i via logical AND
+    je(.DPOSTACCUM)                                // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                             // k_left loop
+    vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) )  // Load A with mask and zero hint
+    add( r10,rax )                                 // a += cs_a
+    vbroadcastsd( mem(rbx),zmm30 )
+    vbroadcastsd( mem(rbx,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm6 )
+    vbroadcastsd( mem(rbx,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm8 )
+    vbroadcastsd( mem(rbx,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm10 )
+    vbroadcastsd( mem(r12),zmm30 )
+    add( r8,rbx )                                  // b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm12 )
+    vbroadcastsd( mem(r12,r9,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm14 )
+    vbroadcastsd( mem(r12,r9,2),zmm30 )
+    vfmadd231pd( zmm0,zmm31,zmm16 )
+    vbroadcastsd( mem(r12,r13,1),zmm31 )
+    vfmadd231pd( zmm0,zmm30,zmm18 )
+    add( r8,r12 )                                  // second pointer of b += rs_b
+    vfmadd231pd( zmm0,zmm31,zmm20 )
+    dec(rsi)                                       // i -= 1
+    jne(.DLOOPKLEFT)                               // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                           // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                  // broadcast alpha
+    mov(var(beta), rax)                            // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                  // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm16,zmm16 )
+    vmulpd( zmm30,zmm18,zmm18 )
+    vmulpd( zmm30,zmm20,zmm20 )
+
+
+    mov(var(rs_c), rsi)                            // load rs_c
+    lea(mem(, rsi, 8), rsi)                        // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                     // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                     // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                          // set ZF if beta == 0
+    je(.DBETAZERO)                                 // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                               // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z))         // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                     // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z))   // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))               // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z))   // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))              // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3 MASK_(k(2)) MASK_(z))   // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))              // store to C with mask
+    vmovupd( mem(rdx),zmm0 MASK_(k(2)) MASK_(z))         // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx) MASK_(k(2)))                    // store to C with mask
+    vmovupd( mem(rdx,rdi,1),zmm3 MASK_(k(2)) MASK_(z))   // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm16)
+    vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2)))              // store to C with mask
+    vmovupd( mem(rdx,rdi,2),zmm0 MASK_(k(2)) MASK_(z))   // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm18)
+    vmovupd( zmm18,(rdx,rdi,2) MASK_(k(2)))              // store to C with mask
+    vmovupd( mem(rdx,r13,1),zmm3 MASK_(k(2)) MASK_(z))   // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm20)
+    vmovupd( zmm20,(rdx,r13,1) MASK_(k(2)))              // store to C with mask
+
+    jmp(.DDONE)                                    // jump to end.
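(Each masked load/fma/store triplet in .DCOLSTORED performs the scalar update
below for one column j of C. This is a sketch under the assumptions that
rs_c == 1 in the column-stored path and that ab[] holds the alpha-scaled
accumulator for that column; neither name exists in the patch.)

    /* C(0:7,j) = beta*C(0:7,j) + ab, restricted to lanes enabled in k2. */
    for ( int i = 0; i < 8; i++ )
        if ( mask & ( 1 << i ) )
            c[ i + j*cs_c ] = (*beta) * c[ i + j*cs_c ] + ab[ i ];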
+
+    label(.DROWSTORED)
+
+    // yet to be implemented
+    jmp(.DDONE)                                    // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                               // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                     // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))               // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))              // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))              // store to C with mask
+    vmovupd( zmm14,(rdx) MASK_(k(2)))                    // store to C with mask
+    vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2)))              // store to C with mask
+    vmovupd( zmm18,(rdx,rdi,2) MASK_(k(2)))              // store to C with mask
+    vmovupd( zmm20,(rdx,r13,1) MASK_(k(2)))              // store to C with mask
+
+    jmp(.DDONE)                                    // jump to end.
+
+
+    label(.DROWSTORBZ)
+
+    // yet to be implemented
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a] "m" (a),
+      [rs_a] "m" (rs_a),
+      [cs_a] "m" (cs_a),
+      [ps_a8] "m" (ps_a8),
+      [b] "m" (b),
+      [rs_b] "m" (rs_b),
+      [cs_b] "m" (cs_b),
+      [alpha] "m" (alpha),
+      [beta] "m" (beta),
+      [c] "m" (c),
+      [rs_c] "m" (rs_c),
+      [cs_c] "m" (cs_c),
+      [n0] "m" (n0),
+      [m0] "m" (m0),
+      [mask] "m" (mask)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/bli_kernels_zen4.h b/kernels/zen4/bli_kernels_zen4.h
index fe29057ec..d9cf739c0 100644
--- a/kernels/zen4/bli_kernels_zen4.h
+++ b/kernels/zen4/bli_kernels_zen4.h
@@ -118,4 +118,47 @@ TRSMSMALL_KER_PROT( d, trsm_small_AltXB_AuXB_AVX512 )
 #ifdef BLIS_ENABLE_OPENMP
 TRSMSMALL_PROT(trsm_small_mt_AVX512)
-#endif
\ No newline at end of file
+#endif
+
+// Dgemm sup RV kernels
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x8m)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x7m)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x6m)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x5m)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x4m)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x3m)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x2m)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x1m)
+
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x8)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x8)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x8)
+
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x7)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x7)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x7)
+
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x6)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x6)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x6)
+
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x5)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x5)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x5)
+
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x4)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x4)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x4)
+
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x3)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x3)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x3)
+
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x2)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x2)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x2)
+
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x1)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x1)
+GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x1)
+
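(For reference, each GEMMSUP_KER_PROT line above declares one kernel with the
sup signature used by the definitions in this patch. Modulo the exact macro
expansion, the 8x8 entry declares the function defined earlier:)

    void bli_dgemmsup_rv_zen4_asm_8x8
         (
           conj_t              conja,
           conj_t              conjb,
           dim_t               m0,
           dim_t               n0,
           dim_t               k0,
           double*    restrict alpha,
           double*    restrict a, inc_t rs_a0, inc_t cs_a0,
           double*    restrict b, inc_t rs_b0, inc_t cs_b0,
           double*    restrict beta,
           double*    restrict c, inc_t rs_c0, inc_t cs_c0,
           auxinfo_t* restrict data,
           cntx_t*    restrict cntx
         );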