From d352c746e5683037d41b5061dfb5ce08e1d0843b Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 27 Aug 2013 13:41:46 -0500 Subject: [PATCH] Added single/real gemm micro-kernel for x86_64. Details: - Added a single-precision real gemm micro-kernel in kernels/x86_64/3/bli_gemm_opt_d4x4.c. - Adjusted the single-precision real register blocksizes in config/clarksville/bli_kernel.h to be 8x4. - Added a missing comment to bli_packm_blk_var2.c that was present in bli_packm_blk_var3.c --- config/clarksville/bli_kernel.h | 8 +- frame/1m/packm/bli_packm_blk_var2.c | 3 + kernels/x86_64/3/bli_gemm_opt_d4x4.c | 714 ++++++++++++++++++++++++++- 3 files changed, 698 insertions(+), 27 deletions(-) diff --git a/config/clarksville/bli_kernel.h b/config/clarksville/bli_kernel.h index 8992df6a7..dca826c64 100644 --- a/config/clarksville/bli_kernel.h +++ b/config/clarksville/bli_kernel.h @@ -54,7 +54,7 @@ // (b) NR (for triangular operations such as trmm and trsm). // -#define BLIS_DEFAULT_MC_S 256 +#define BLIS_DEFAULT_MC_S 768 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8192 @@ -82,8 +82,8 @@ #define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) #define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) -#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/2) -#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/2) +#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/2) +#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/2) #define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) #define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) @@ -100,7 +100,7 @@ // in the m and n dimensions should all be equal to the size expected by // the reference micro-kernel(s). -#define BLIS_DEFAULT_MR_S 4 +#define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MR_D 4 diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var2.c index c9e394b80..6a1b15cf3 100644 --- a/frame/1m/packm/bli_packm_blk_var2.c +++ b/frame/1m/packm/bli_packm_blk_var2.c @@ -260,6 +260,9 @@ void PASTEMAC(ch,varname )( \ { \ diagoffc_i_abs = bli_abs( diagoffc_i ); \ \ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel, but we can probably still support those cases if + it happens. */ \ if ( ( bli_is_col_stored( rs_p, cs_p ) && diagoffc_i < 0 ) || \ ( bli_is_row_stored( rs_p, cs_p ) && diagoffc_i > 0 ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ diff --git a/kernels/x86_64/3/bli_gemm_opt_d4x4.c b/kernels/x86_64/3/bli_gemm_opt_d4x4.c index fcf8f76cd..f133d8e64 100644 --- a/kernels/x86_64/3/bli_gemm_opt_d4x4.c +++ b/kernels/x86_64/3/bli_gemm_opt_d4x4.c @@ -45,7 +45,675 @@ void bli_sgemm_opt_d4x4( float* restrict b_next ) { - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + dim_t k_iter; + dim_t k_left; + + k_iter = k / 4; + k_left = k % 4; + + __asm__ volatile + ( + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + "movq %9, %%r9 \n\t" // load address of b_next. + " \n\t" + "subq $-8 * 16, %%rax \n\t" // increment pointers to allow byte + "subq $-8 * 16, %%rbx \n\t" // offsets in the unrolled iterations. + " \n\t" + "movaps -8 * 16(%%rax), %%xmm0 \n\t" // initialize loop by pre-loading elements + "movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b. + "movaps -8 * 16(%%rbx), %%xmm2 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %8, %%rdi \n\t" // load cs_c + "leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float) + "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c; + " \n\t" + "prefetcht2 0 * 4(%%r9) \n\t" // prefetch b_next + " \n\t" + "xorpd %%xmm3, %%xmm3 \n\t" + "xorpd %%xmm4, %%xmm4 \n\t" + "xorpd %%xmm5, %%xmm5 \n\t" + "xorpd %%xmm6, %%xmm6 \n\t" + " \n\t" + "prefetcht0 6 * 4(%%rcx) \n\t" // prefetch c + 0*cs_c + "xorpd %%xmm8, %%xmm8 \n\t" + "movaps %%xmm8, %%xmm9 \n\t" + "prefetcht0 6 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c + "movaps %%xmm8, %%xmm10 \n\t" + "movaps %%xmm8, %%xmm11 \n\t" + "prefetcht0 6 * 4(%%r10) \n\t" // prefetch c + 2*cs_c + "movaps %%xmm8, %%xmm12 \n\t" + "movaps %%xmm8, %%xmm13 \n\t" + "prefetcht0 6 * 4(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c + "movaps %%xmm8, %%xmm14 \n\t" + "movaps %%xmm8, %%xmm15 \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".SLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + "prefetcht0 (4*35+1) * 8(%%rax) \n\t" + " \n\t" + "addps %%xmm6, %%xmm10 \n\t" // iteration 0 + "addps %%xmm3, %%xmm14 \n\t" + "movaps %%xmm2, %%xmm3 \n\t" + "pshufd $0x39, %%xmm2, %%xmm7 \n\t" + "mulps %%xmm0, %%xmm2 \n\t" + "mulps %%xmm1, %%xmm3 \n\t" + " \n\t" + "addps %%xmm4, %%xmm11 \n\t" + "addps %%xmm5, %%xmm15 \n\t" + "movaps %%xmm7, %%xmm5 \n\t" + "pshufd $0x39, %%xmm7, %%xmm6 \n\t" + "mulps %%xmm0, %%xmm7 \n\t" + "mulps %%xmm1, %%xmm5 \n\t" + " \n\t" + "addps %%xmm2, %%xmm8 \n\t" + "movaps -7 * 16(%%rbx), %%xmm2 \n\t" + "addps %%xmm3, %%xmm12 \n\t" + "movaps %%xmm6, %%xmm3 \n\t" + "pshufd $0x39, %%xmm6, %%xmm4 \n\t" + "mulps %%xmm0, %%xmm6 \n\t" + "mulps %%xmm1, %%xmm3 \n\t" + " \n\t" + "addps %%xmm7, %%xmm9 \n\t" + "addps %%xmm5, %%xmm13 \n\t" + "movaps %%xmm4, %%xmm5 \n\t" + "mulps %%xmm0, %%xmm4 \n\t" + "movaps -6 * 16(%%rax), %%xmm0 \n\t" + "mulps %%xmm1, %%xmm5 \n\t" + "movaps -5 * 16(%%rax), %%xmm1 \n\t" + " \n\t" + " \n\t" + "addps %%xmm6, %%xmm10 \n\t" // iteration 1 + "addps %%xmm3, %%xmm14 \n\t" + "movaps %%xmm2, %%xmm3 \n\t" + "pshufd $0x39, %%xmm2, %%xmm7 \n\t" + "mulps %%xmm0, %%xmm2 \n\t" + "mulps %%xmm1, %%xmm3 \n\t" + " \n\t" + "addps %%xmm4, %%xmm11 \n\t" + "addps %%xmm5, %%xmm15 \n\t" + "movaps %%xmm7, %%xmm5 \n\t" + "pshufd $0x39, %%xmm7, %%xmm6 \n\t" + "mulps %%xmm0, %%xmm7 \n\t" + "mulps %%xmm1, %%xmm5 \n\t" + " \n\t" + "addps %%xmm2, %%xmm8 \n\t" + "movaps -6 * 16(%%rbx), %%xmm2 \n\t" + "addps %%xmm3, %%xmm12 \n\t" + "movaps %%xmm6, %%xmm3 \n\t" + "pshufd $0x39, %%xmm6, %%xmm4 \n\t" + "mulps %%xmm0, %%xmm6 \n\t" + "mulps %%xmm1, %%xmm3 \n\t" + " \n\t" + "addps %%xmm7, %%xmm9 \n\t" + "addps %%xmm5, %%xmm13 \n\t" + "movaps %%xmm4, %%xmm5 \n\t" + "mulps %%xmm0, %%xmm4 \n\t" + "movaps -4 * 16(%%rax), %%xmm0 \n\t" + "mulps %%xmm1, %%xmm5 \n\t" + "movaps -3 * 16(%%rax), %%xmm1 \n\t" + " \n\t" + " \n\t" + "addps %%xmm6, %%xmm10 \n\t" // iteration 2 + "addps %%xmm3, %%xmm14 \n\t" + "movaps %%xmm2, %%xmm3 \n\t" + "pshufd $0x39, %%xmm2, %%xmm7 \n\t" + "mulps %%xmm0, %%xmm2 \n\t" + "mulps %%xmm1, %%xmm3 \n\t" + " \n\t" + "addps %%xmm4, %%xmm11 \n\t" + "addps %%xmm5, %%xmm15 \n\t" + "movaps %%xmm7, %%xmm5 \n\t" + "pshufd $0x39, %%xmm7, %%xmm6 \n\t" + "mulps %%xmm0, %%xmm7 \n\t" + "mulps %%xmm1, %%xmm5 \n\t" + " \n\t" + "addps %%xmm2, %%xmm8 \n\t" + "movaps -5 * 16(%%rbx), %%xmm2 \n\t" + "addps %%xmm3, %%xmm12 \n\t" + "movaps %%xmm6, %%xmm3 \n\t" + "pshufd $0x39, %%xmm6, %%xmm4 \n\t" + "mulps %%xmm0, %%xmm6 \n\t" + "mulps %%xmm1, %%xmm3 \n\t" + " \n\t" + "addps %%xmm7, %%xmm9 \n\t" + "addps %%xmm5, %%xmm13 \n\t" + "movaps %%xmm4, %%xmm5 \n\t" + "mulps %%xmm0, %%xmm4 \n\t" + "movaps -2 * 16(%%rax), %%xmm0 \n\t" + "mulps %%xmm1, %%xmm5 \n\t" + "movaps -1 * 16(%%rax), %%xmm1 \n\t" + " \n\t" + " \n\t" + "addps %%xmm6, %%xmm10 \n\t" // iteration 3 + "addps %%xmm3, %%xmm14 \n\t" + "movaps %%xmm2, %%xmm3 \n\t" + "pshufd $0x39, %%xmm2, %%xmm7 \n\t" + "mulps %%xmm0, %%xmm2 \n\t" + "mulps %%xmm1, %%xmm3 \n\t" + " \n\t" + "subq $-4 * 8 * 4, %%rax \n\t" // a += 4*8 (unroll x mr) + " \n\t" + "addps %%xmm4, %%xmm11 \n\t" + "addps %%xmm5, %%xmm15 \n\t" + "movaps %%xmm7, %%xmm5 \n\t" + "pshufd $0x39, %%xmm7, %%xmm6 \n\t" + "mulps %%xmm0, %%xmm7 \n\t" + "mulps %%xmm1, %%xmm5 \n\t" + " \n\t" + "subq $-4 * 4 * 4, %%r9 \n\t" // b_next += 4*4 (unroll x nr) + " \n\t" + "addps %%xmm2, %%xmm8 \n\t" + "movaps -4 * 16(%%rbx), %%xmm2 \n\t" + "addps %%xmm3, %%xmm12 \n\t" + "movaps %%xmm6, %%xmm3 \n\t" + "pshufd $0x39, %%xmm6, %%xmm4 \n\t" + "mulps %%xmm0, %%xmm6 \n\t" + "mulps %%xmm1, %%xmm3 \n\t" + " \n\t" + "subq $-4 * 4 * 4, %%rbx \n\t" // b += 4*4 (unroll x nr) + " \n\t" + "addps %%xmm7, %%xmm9 \n\t" + "addps %%xmm5, %%xmm13 \n\t" + "movaps %%xmm4, %%xmm5 \n\t" + "mulps %%xmm0, %%xmm4 \n\t" + "movaps -8 * 16(%%rax), %%xmm0 \n\t" + "mulps %%xmm1, %%xmm5 \n\t" + "movaps -7 * 16(%%rax), %%xmm1 \n\t" + " \n\t" + "prefetcht2 0 * 4(%%r9) \n\t" // prefetch b_next[0] + "prefetcht2 16 * 4(%%r9) \n\t" // prefetch b_next[16] + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .SLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + ".SCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. + " \n\t" + " \n\t" + ".SLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "addps %%xmm6, %%xmm10 \n\t" // iteration 0 + "addps %%xmm3, %%xmm14 \n\t" + "movaps %%xmm2, %%xmm3 \n\t" + "pshufd $0x39, %%xmm2, %%xmm7 \n\t" + "mulps %%xmm0, %%xmm2 \n\t" + "mulps %%xmm1, %%xmm3 \n\t" + " \n\t" + "addps %%xmm4, %%xmm11 \n\t" + "addps %%xmm5, %%xmm15 \n\t" + "movaps %%xmm7, %%xmm5 \n\t" + "pshufd $0x39, %%xmm7, %%xmm6 \n\t" + "mulps %%xmm0, %%xmm7 \n\t" + "mulps %%xmm1, %%xmm5 \n\t" + " \n\t" + "addps %%xmm2, %%xmm8 \n\t" + "movaps -7 * 16(%%rbx), %%xmm2 \n\t" + "addps %%xmm3, %%xmm12 \n\t" + "movaps %%xmm6, %%xmm3 \n\t" + "pshufd $0x39, %%xmm6, %%xmm4 \n\t" + "mulps %%xmm0, %%xmm6 \n\t" + "mulps %%xmm1, %%xmm3 \n\t" + " \n\t" + "addps %%xmm7, %%xmm9 \n\t" + "addps %%xmm5, %%xmm13 \n\t" + "movaps %%xmm4, %%xmm5 \n\t" + "mulps %%xmm0, %%xmm4 \n\t" + "movaps -6 * 16(%%rax), %%xmm0 \n\t" + "mulps %%xmm1, %%xmm5 \n\t" + "movaps -5 * 16(%%rax), %%xmm1 \n\t" + " \n\t" + "subq $-1 * 8 * 4, %%rax \n\t" // a += 8 (1 x mr) + "subq $-1 * 4 * 4, %%rbx \n\t" // b += 4 (1 x nr) + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .SLOOPKLEFT \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + ".SPOSTACCUM: \n\t" + " \n\t" + "addps %%xmm6, %%xmm10 \n\t" + "addps %%xmm3, %%xmm14 \n\t" + "addps %%xmm4, %%xmm11 \n\t" + "addps %%xmm5, %%xmm15 \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "movq %5, %%rbx \n\t" // load address of beta + "movss (%%rax), %%xmm6 \n\t" // load alpha to bottom 4 bytes of xmm6 + "movss (%%rbx), %%xmm7 \n\t" // load beta to bottom 4 bytes of xmm7 + "pshufd $0x00, %%xmm6, %%xmm6 \n\t" // populate xmm6 with four alphas + "pshufd $0x00, %%xmm7, %%xmm7 \n\t" // populate xmm7 with four betas + " \n\t" + " \n\t" + "movq %7, %%rsi \n\t" // load rs_c + "movq %%rsi, %%r8 \n\t" // make a copy of rs_c + " \n\t" + "leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float) + "leaq (%%rsi,%%rsi,2), %%r11 \n\t" // r11 = 3*(rs_c * sizeof(float)) + " \n\t" + "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; + " \n\t" + " \n\t" // xmm8: xmm9: xmm10: xmm11: + " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 + " \n\t" // ab11 ab12 ab13 ab10 + " \n\t" // ab22 ab23 ab20 ab21 + " \n\t" // ab33 ) ab30 ) ab31 ) ab32 ) + " \n\t" // + " \n\t" // xmm12: xmm13: xmm14: xmm15: + " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43 + " \n\t" // ab51 ab52 ab53 ab50 + " \n\t" // ab62 ab63 ab60 ab61 + " \n\t" // ab73 ) ab70 ) ab71 ) ab72 ) + "movaps %%xmm9, %%xmm4 \n\t" + "shufps $0xd8, %%xmm8, %%xmm9 \n\t" + "shufps $0xd8, %%xmm11, %%xmm8 \n\t" + "shufps $0xd8, %%xmm10, %%xmm11\n\t" + "shufps $0xd8, %%xmm4, %%xmm10\n\t" + " \n\t" + "movaps %%xmm8, %%xmm4 \n\t" + "shufps $0xd8, %%xmm10, %%xmm8 \n\t" + "shufps $0xd8, %%xmm4, %%xmm10 \n\t" + "movaps %%xmm9, %%xmm5 \n\t" + "shufps $0xd8, %%xmm11, %%xmm9 \n\t" + "shufps $0xd8, %%xmm5, %%xmm11 \n\t" + " \n\t" + "movaps %%xmm13, %%xmm4 \n\t" + "shufps $0xd8, %%xmm12, %%xmm13\n\t" + "shufps $0xd8, %%xmm15, %%xmm12\n\t" + "shufps $0xd8, %%xmm14, %%xmm15\n\t" + "shufps $0xd8, %%xmm4, %%xmm14\n\t" + " \n\t" + "movaps %%xmm12, %%xmm4 \n\t" + "shufps $0xd8, %%xmm14, %%xmm12\n\t" + "shufps $0xd8, %%xmm4, %%xmm14 \n\t" + "movaps %%xmm13, %%xmm5 \n\t" + "shufps $0xd8, %%xmm15, %%xmm13\n\t" + "shufps $0xd8, %%xmm5, %%xmm15 \n\t" + " \n\t" // xmm8: xmm9: xmm10: xmm11: + " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 + " \n\t" // ab10 ab11 ab12 ab13 + " \n\t" // ab20 ab21 ab22 ab23 + " \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) + " \n\t" // + " \n\t" // xmm12: xmm13: xmm14: xmm15: + " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43 + " \n\t" // ab50 ab51 ab52 ab53 + " \n\t" // ab60 ab61 ab62 ab63 + " \n\t" // ab70 ) ab71 ) ab72 ) ab73 ) + " \n\t" + " \n\t" + " \n\t" + " \n\t" // determine if + " \n\t" // c % 16 == 0, AND + " \n\t" // rs_c == 1 + " \n\t" // ie: aligned and column-stored + " \n\t" + "cmpq $1, %%r8 \n\t" // set ZF if rs_c == 1. + "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); + "testq $15, %%rcx \n\t" // set ZF if c & 16 is zero. + "setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 ); + " \n\t" // and(bl,bh) will reveal result + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + " \n\t" + "xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero. + "ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0. + "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case + " \n\t" + " \n\t" + " \n\t" // check if aligned/column-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "jne .SCOLSTORED \n\t" // jump to column storage case + " \n\t" + " \n\t" + " \n\t" + ".SGENSTORED: \n\t" + " \n\t" + "movlps (%%rcx ), %%xmm0 \n\t" // load c00 ~ c30 + "movhps (%%rcx,%%rsi,1), %%xmm0 \n\t" + "movlps (%%rcx,%%rsi,2), %%xmm1 \n\t" + "movhps (%%rcx,%%r11 ), %%xmm1 \n\t" + "shufps $0x88, %%xmm1, %%xmm0 \n\t" + " \n\t" + "mulps %%xmm6, %%xmm8 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm8, %%xmm0 \n\t" // add the gemm result, + " \n\t" + "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rcx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rcx \n\t" + " \n\t" + " \n\t" + "movlps (%%rdx ), %%xmm0 \n\t" // load c40 ~ c70 + "movhps (%%rdx,%%rsi,1), %%xmm0 \n\t" + "movlps (%%rdx,%%rsi,2), %%xmm1 \n\t" + "movhps (%%rdx,%%r11 ), %%xmm1 \n\t" + "shufps $0x88, %%xmm1, %%xmm0 \n\t" + " \n\t" + "mulps %%xmm6, %%xmm12 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm12, %%xmm0 \n\t" // add the gemm result, + " \n\t" + "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rdx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "movlps (%%rcx ), %%xmm0 \n\t" // load c01 ~ c31 + "movhps (%%rcx,%%rsi,1), %%xmm0 \n\t" + "movlps (%%rcx,%%rsi,2), %%xmm1 \n\t" + "movhps (%%rcx,%%r11 ), %%xmm1 \n\t" + "shufps $0x88, %%xmm1, %%xmm0 \n\t" + " \n\t" + "mulps %%xmm6, %%xmm9 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm9, %%xmm0 \n\t" // add the gemm result, + " \n\t" + "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rcx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rcx \n\t" + " \n\t" + " \n\t" + "movlps (%%rdx ), %%xmm0 \n\t" // load c41 ~ c71 + "movhps (%%rdx,%%rsi,1), %%xmm0 \n\t" + "movlps (%%rdx,%%rsi,2), %%xmm1 \n\t" + "movhps (%%rdx,%%r11 ), %%xmm1 \n\t" + "shufps $0x88, %%xmm1, %%xmm0 \n\t" + " \n\t" + "mulps %%xmm6, %%xmm13 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm13, %%xmm0 \n\t" // add the gemm result, + " \n\t" + "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rdx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "movlps (%%rcx ), %%xmm0 \n\t" // load c02 ~ c32 + "movhps (%%rcx,%%rsi,1), %%xmm0 \n\t" + "movlps (%%rcx,%%rsi,2), %%xmm1 \n\t" + "movhps (%%rcx,%%r11 ), %%xmm1 \n\t" + "shufps $0x88, %%xmm1, %%xmm0 \n\t" + " \n\t" + "mulps %%xmm6, %%xmm10 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm10, %%xmm0 \n\t" // add the gemm result, + " \n\t" + "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rcx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rcx \n\t" + " \n\t" + " \n\t" + "movlps (%%rdx ), %%xmm0 \n\t" // load c42 ~ c72 + "movhps (%%rdx,%%rsi,1), %%xmm0 \n\t" + "movlps (%%rdx,%%rsi,2), %%xmm1 \n\t" + "movhps (%%rdx,%%r11 ), %%xmm1 \n\t" + "shufps $0x88, %%xmm1, %%xmm0 \n\t" + " \n\t" + "mulps %%xmm6, %%xmm14 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm14, %%xmm0 \n\t" // add the gemm result, + " \n\t" + "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rdx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "movlps (%%rcx ), %%xmm0 \n\t" // load c03 ~ c33 + "movhps (%%rcx,%%rsi,1), %%xmm0 \n\t" + "movlps (%%rcx,%%rsi,2), %%xmm1 \n\t" + "movhps (%%rcx,%%r11 ), %%xmm1 \n\t" + "shufps $0x88, %%xmm1, %%xmm0 \n\t" + " \n\t" + "mulps %%xmm6, %%xmm11 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm11, %%xmm0 \n\t" // add the gemm result, + " \n\t" + "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rcx,%%r11 ) \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movlps (%%rdx ), %%xmm0 \n\t" // load c43 ~ c73 + "movhps (%%rdx,%%rsi,1), %%xmm0 \n\t" + "movlps (%%rdx,%%rsi,2), %%xmm1 \n\t" + "movhps (%%rdx,%%r11 ), %%xmm1 \n\t" + "shufps $0x88, %%xmm1, %%xmm0 \n\t" + " \n\t" + "mulps %%xmm6, %%xmm15 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm15, %%xmm0 \n\t" // add the gemm result, + " \n\t" + "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rdx,%%r11 ) \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .SDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".SCOLSTORED: \n\t" + " \n\t" + "movaps (%%rcx), %%xmm0 \n\t" // load c00 ~ c30, + "mulps %%xmm6, %%xmm8 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm8, %%xmm0 \n\t" // add the gemm result, + "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. + "addq %%rdi, %%rcx \n\t" + " \n\t" + "movaps (%%rdx), %%xmm1 \n\t" // load c40 ~ c70, + "mulps %%xmm6, %%xmm12 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm1 \n\t" // scale by beta, + "addps %%xmm12, %%xmm1 \n\t" // add the gemm result, + "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + " \n\t" + "movaps (%%rcx), %%xmm0 \n\t" // load c01 ~ c31, + "mulps %%xmm6, %%xmm9 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm9, %%xmm0 \n\t" // add the gemm result, + "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. + "addq %%rdi, %%rcx \n\t" + " \n\t" + "movaps (%%rdx), %%xmm1 \n\t" // load c41 ~ c71, + "mulps %%xmm6, %%xmm13 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm1 \n\t" // scale by beta, + "addps %%xmm13, %%xmm1 \n\t" // add the gemm result, + "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + " \n\t" + "movaps (%%rcx), %%xmm0 \n\t" // load c02 ~ c32, + "mulps %%xmm6, %%xmm10 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm10, %%xmm0 \n\t" // add the gemm result, + "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. + "addq %%rdi, %%rcx \n\t" + " \n\t" + "movaps (%%rdx), %%xmm1 \n\t" // load c42 ~ c72, + "mulps %%xmm6, %%xmm14 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm1 \n\t" // scale by beta, + "addps %%xmm14, %%xmm1 \n\t" // add the gemm result, + "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + " \n\t" + "movaps (%%rcx), %%xmm0 \n\t" // load c03 ~ c33, + "mulps %%xmm6, %%xmm11 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, + "addps %%xmm11, %%xmm0 \n\t" // add the gemm result, + "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. + " \n\t" + " \n\t" + "movaps (%%rdx), %%xmm1 \n\t" // load c43 ~ c73, + "mulps %%xmm6, %%xmm15 \n\t" // scale by alpha, + "mulps %%xmm7, %%xmm1 \n\t" // scale by beta, + "addps %%xmm15, %%xmm1 \n\t" // add the gemm result, + "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. + " \n\t" + "jmp .SDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".SBETAZERO: \n\t" + " \n\t" // check if aligned/column-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "jne .SCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + " \n\t" + " \n\t" + ".SGENSTORBZ: \n\t" + " \n\t" + "jmp .SGENSTORED \n\t" // use gen-stored beta != 0 case for now + //"jmp .SDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".SCOLSTORBZ: \n\t" + " \n\t" + " \n\t" // skip loading c00 ~ c30, + "mulps %%xmm6, %%xmm8 \n\t" // scale by alpha, + "movaps %%xmm8, (%%rcx) \n\t" // and store back to memory. + "addq %%rdi, %%rcx \n\t" + " \n\t" // skip loading c40 ~ c70, + "mulps %%xmm6, %%xmm12 \n\t" // scale by alpha, + "movaps %%xmm12, (%%rdx) \n\t" // and store back to memory. + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + " \n\t" // skip loading c01 ~ c31, + "mulps %%xmm6, %%xmm9 \n\t" // scale by alpha, + "movaps %%xmm9, (%%rcx) \n\t" // and store back to memory. + "addq %%rdi, %%rcx \n\t" + " \n\t" // skip loading c41 ~ c71, + "mulps %%xmm6, %%xmm13 \n\t" // scale by alpha, + "movaps %%xmm13, (%%rdx) \n\t" // and store back to memory. + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + " \n\t" // skip loading c02 ~ c32, + "mulps %%xmm6, %%xmm10 \n\t" // scale by alpha, + "movaps %%xmm10, (%%rcx) \n\t" // and store back to memory. + "addq %%rdi, %%rcx \n\t" + " \n\t" // skip loading c42 ~ c72, + "mulps %%xmm6, %%xmm14 \n\t" // scale by alpha, + "movaps %%xmm14, (%%rdx) \n\t" // and store back to memory. + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + " \n\t" // skip loading c03 ~ c33, + "mulps %%xmm6, %%xmm11 \n\t" // scale by alpha, + "movaps %%xmm11, (%%rcx) \n\t" // and store back to memory. + " \n\t" + " \n\t" // skip loading c43 ~ c73, + "mulps %%xmm6, %%xmm15 \n\t" // scale by alpha, + "movaps %%xmm15, (%%rdx) \n\t" // and store back to memory. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".SDONE: \n\t" + " \n\t" + + : // output operands (none) + : // input operands + "m" (k_iter), + "m" (k_left), + "m" (a), + "m" (b), + "m" (alpha), + "m" (beta), + "m" (c), + "m" (rs_c), + "m" (cs_c), + "m" (b_next) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } void bli_dgemm_opt_d4x4( @@ -109,11 +777,11 @@ void bli_dgemm_opt_d4x4( " \n\t" "movq %0, %%rsi \n\t" // i = k_iter; "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that + "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" - ".LOOPKITER: \n\t" // MAIN LOOP + ".DLOOPKITER: \n\t" // MAIN LOOP " \n\t" "prefetcht0 (4*35+1) * 8(%%rax) \n\t" " \n\t" @@ -252,19 +920,19 @@ void bli_dgemm_opt_d4x4( " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; - "jne .LOOPKITER \n\t" // iterate again if i != 0. + "jne .DLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" - ".CONSIDERKLEFT: \n\t" + ".DCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .POSTACCUM \n\t" // if i == 0, we're done; jump to end. + "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. " \n\t" " \n\t" - ".LOOPKLEFT: \n\t" // EDGE LOOP + ".DLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "addpd %%xmm3, %%xmm11 \n\t" // iteration 0 "movaps -7 * 16(%%rbx), %%xmm3 \n\t" @@ -302,11 +970,11 @@ void bli_dgemm_opt_d4x4( " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; - "jne .LOOPKLEFT \n\t" // iterate again if i != 0. + "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" - ".POSTACCUM: \n\t" + ".DPOSTACCUM: \n\t" " \n\t" "addpd %%xmm3, %%xmm11 \n\t" "addpd %%xmm4, %%xmm15 \n\t" @@ -322,7 +990,8 @@ void bli_dgemm_opt_d4x4( " \n\t" "movq %7, %%rsi \n\t" // load rs_c "movq %%rsi, %%r8 \n\t" // make a copy of rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rs_c *= sizeof(double) + " \n\t" + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double) " \n\t" "leaq (%%rcx,%%rsi,2), %%rdx \n\t" // load address of c + 2*rs_c; " \n\t" @@ -373,16 +1042,16 @@ void bli_dgemm_opt_d4x4( " \n\t" "xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero. "ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0. - "je .BETAZERO \n\t" // if ZF = 1, jump to beta == 0 case + "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" " \n\t" // check if aligned/column-stored "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "jne .COLSTORED \n\t" // jump to column storage case + "jne .DCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" - ".GENSTORED: \n\t" + ".DGENSTORED: \n\t" " \n\t" "movlpd (%%rcx), %%xmm0 \n\t" // load c00 and c10, "movhpd (%%rcx,%%rsi), %%xmm0 \n\t" @@ -461,11 +1130,11 @@ void bli_dgemm_opt_d4x4( "movlpd %%xmm1, (%%rdx) \n\t" // and store back to memory. "movhpd %%xmm1, (%%rdx,%%rsi) \n\t" " \n\t" - "jmp .DONE \n\t" // jump to end. + "jmp .DDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" - ".COLSTORED: \n\t" + ".DCOLSTORED: \n\t" " \n\t" "movaps (%%rcx), %%xmm0 \n\t" // load c00 and c10, "mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha, @@ -528,19 +1197,19 @@ void bli_dgemm_opt_d4x4( "addpd %%xmm15, %%xmm1 \n\t" // add the gemm result, "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. " \n\t" - "jmp .DONE \n\t" // jump to end. + "jmp .DDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" " \n\t" - ".BETAZERO: \n\t" + ".DBETAZERO: \n\t" " \n\t" // check if aligned/column-stored "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "jne .COLSTORBZ \n\t" // jump to column storage case + "jne .DCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" - ".GENSTORBZ: \n\t" + ".DGENSTORBZ: \n\t" " \n\t" // skip loading c00 and c10, "mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha, "movlpd %%xmm8, (%%rcx) \n\t" // and store back to memory. @@ -587,11 +1256,11 @@ void bli_dgemm_opt_d4x4( "movlpd %%xmm15, (%%rdx) \n\t" // and store back to memory. "movhpd %%xmm15, (%%rdx,%%rsi) \n\t" " \n\t" - "jmp .DONE \n\t" // jump to end. + "jmp .DDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" - ".COLSTORBZ: \n\t" + ".DCOLSTORBZ: \n\t" " \n\t" " \n\t" // skip loading c00 and c10, "mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha, @@ -638,7 +1307,7 @@ void bli_dgemm_opt_d4x4( " \n\t" " \n\t" " \n\t" - ".DONE: \n\t" + ".DDONE: \n\t" " \n\t" : // output operands (none) @@ -661,7 +1330,6 @@ void bli_dgemm_opt_d4x4( "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); - } void bli_cgemm_opt_d4x4(