diff --git a/config/sandybridge/bli_kernel.h b/config/sandybridge/bli_kernel.h index 33f7fcaf2..927df2c32 100644 --- a/config/sandybridge/bli_kernel.h +++ b/config/sandybridge/bli_kernel.h @@ -154,9 +154,10 @@ #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8 -//#define BLIS_DGEMM_UKERNEL bli_dgemm_int_8x4 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4 +#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4 + #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4 // -- trsm-related -- diff --git a/kernels/x86_64/avx/3/bli_gemm_asm_d8x4.c b/kernels/x86_64/avx/3/bli_gemm_asm_d8x4.c index 53f6c4614..c885ceb51 100644 --- a/kernels/x86_64/avx/3/bli_gemm_asm_d8x4.c +++ b/kernels/x86_64/avx/3/bli_gemm_asm_d8x4.c @@ -447,26 +447,6 @@ void bli_sgemm_asm_8x8( "jne .SCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab02 ( ab04 ( ab06 - " \n\t" // ab10 ab12 ab14 ab16 - " \n\t" // ab20 ab22 ab24 ab26 - " \n\t" // ab30 ab32 ab34 ab36 - " \n\t" // ab40 ab42 ab44 ab46 - " \n\t" // ab50 ab52 ab54 ab56 - " \n\t" // ab60 ab62 ab64 ab66 - " \n\t" // ab70 ) ab72 ) ab74 ) ab76 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab01 ( ab03 ( ab05 ( ab07 - " \n\t" // ab11 ab13 ab15 ab17 - " \n\t" // ab21 ab23 ab25 ab27 - " \n\t" // ab31 ab33 ab35 ab37 - " \n\t" // ab41 ab43 ab45 ab47 - " \n\t" // ab51 ab53 ab55 ab57 - " \n\t" // ab61 ab63 ab65 ab67 - " \n\t" // ab71 ) ab73 ) ab75 ) ab77 ) - " \n\t" " \n\t" ".SGENSTORED: \n\t" " \n\t" @@ -1757,14 +1737,962 @@ void bli_cgemm_asm_8x4( auxinfo_t* data ) { - /* Just call the reference implementation. */ - BLIS_CGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data ); + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + dim_t k_iter = k / 4; + dim_t k_left = k % 4; + + __asm__ volatile + ( + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. + //"movq %10, %%r14 \n\t" // load address of a_next. + " \n\t" + "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading + "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" + "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %8, %%rdi \n\t" // load cs_c + "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(scomplex) + "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c; + " \n\t" + "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c + "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c + "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c + "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c + " \n\t" + "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" + "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" + "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".CLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 8 * 32(%%rax) \n\t" + "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" + "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t" + "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" + "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t" + "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" + "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t" + "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t" + "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" + "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" + "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" + "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" + "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" + "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" + "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 1 + "prefetcht0 10 * 32(%%rax) \n\t" + "vmovaps 3 * 32(%%rax), %%ymm1 \n\t" + "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t" + "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovshdup 1 * 32(%%rbx), %%ymm2 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" + "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t" + "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" + "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t" + "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t" + "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" + "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovsldup 2 * 32(%%rbx), %%ymm2 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" + "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" + "vmovaps 4 * 32(%%rax), %%ymm0 \n\t" + "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" + "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" + "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 2 + "prefetcht0 12 * 32(%%rax) \n\t" + "vmovaps 5 * 32(%%rax), %%ymm1 \n\t" + "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t" + "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovshdup 2 * 32(%%rbx), %%ymm2 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" + "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t" + "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" + "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t" + "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t" + "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" + "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovsldup 3 * 32(%%rbx), %%ymm2 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" + "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" + "vmovaps 6 * 32(%%rax), %%ymm0 \n\t" + "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" + "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" + "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 3 + "prefetcht0 14 * 32(%%rax) \n\t" + "vmovaps 7 * 32(%%rax), %%ymm1 \n\t" + "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t" + "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovshdup 3 * 32(%%rbx), %%ymm2 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" + "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t" + "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" + "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t" + "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t" + "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" + "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovsldup 4 * 32(%%rbx), %%ymm2 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" + "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" + "vmovaps 8 * 32(%%rax), %%ymm0 \n\t" + "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" + "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" + "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + " \n\t" + "addq $8 * 4 * 8, %%rax \n\t" // a += 8*4 (unroll x mr) + "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .CLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".CCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. + " \n\t" + " \n\t" + ".CLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 8 * 32(%%rax) \n\t" + "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" + "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t" + "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" + "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t" + "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" + "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t" + "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t" + "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" + "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" + "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" + "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" + "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" + "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" + "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" + "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + " \n\t" + "addq $8 * 1 * 8, %%rax \n\t" // a += 8 (1 x mr) + "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .CLOOPKLEFT \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + ".CPOSTACCUM: \n\t" + " \n\t" + " \n\t" // ymm15: ymm13: ymm11: ymm9: + " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 + " \n\t" // ab10 ab11 ab12 ab13 + " \n\t" // ab21 ab20 ab23 ab22 + " \n\t" // ab31 ab30 ab33 ab32 + " \n\t" // ab42 ab43 ab40 ab41 + " \n\t" // ab52 ab53 ab50 ab51 + " \n\t" // ab63 ab62 ab61 ab60 + " \n\t" // ab73 ) ab72 ) ab71 ) ab70 ) + " \n\t" + " \n\t" // ymm14: ymm12: ymm10: ymm8: + " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83 + " \n\t" // ab90 ab91 ab92 ab93 + " \n\t" // aba1 aba0 aba3 aba2 + " \n\t" // abb1 abb0 abb3 abb2 + " \n\t" // abc2 abc3 abc0 abc1 + " \n\t" // abd2 abd3 abd0 abd1 + " \n\t" // abe3 abe2 abe1 abe0 + " \n\t" // abf3 abf2 abf1 abf0 ) + " \n\t" + "vmovaps %%ymm15, %%ymm7 \n\t" + "vshufps $0xe4, %%ymm13, %%ymm15, %%ymm15 \n\t" + "vshufps $0xe4, %%ymm7, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmovaps %%ymm11, %%ymm7 \n\t" + "vshufps $0xe4, %%ymm9, %%ymm11, %%ymm11 \n\t" + "vshufps $0xe4, %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmovaps %%ymm14, %%ymm7 \n\t" + "vshufps $0xe4, %%ymm12, %%ymm14, %%ymm14 \n\t" + "vshufps $0xe4, %%ymm7, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vmovaps %%ymm10, %%ymm7 \n\t" + "vshufps $0xe4, %%ymm8, %%ymm10, %%ymm10 \n\t" + "vshufps $0xe4, %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + " \n\t" // ymm15: ymm13: ymm11: ymm9: + " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 + " \n\t" // ab10 ab11 ab12 ab13 + " \n\t" // ab20 ab21 ab22 ab23 + " \n\t" // ab30 ab31 ab32 ab33 + " \n\t" // ab42 ab43 ab40 ab41 + " \n\t" // ab52 ab53 ab50 ab51 + " \n\t" // ab62 ab63 ab60 ab61 + " \n\t" // ab72 ) ab73 ) ab70 ) ab71 ) + " \n\t" + " \n\t" // ymm14: ymm12: ymm10: ymm8: + " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83 + " \n\t" // ab90 ab91 ab92 ab93 + " \n\t" // aba0 aba1 aba2 aba3 + " \n\t" // abb0 abb1 abb2 abb3 + " \n\t" // abc2 abc3 abc0 abc1 + " \n\t" // abd2 abd3 abd0 abd1 + " \n\t" // abe2 abe3 abe0 abe1 + " \n\t" // abf2 ) abf3 ) abf0 ) abf1 ) + " \n\t" + "vmovaps %%ymm15, %%ymm7 \n\t" + "vperm2f128 $0x12, %%ymm15, %%ymm11, %%ymm15 \n\t" + "vperm2f128 $0x30, %%ymm7, %%ymm11, %%ymm11 \n\t" + " \n\t" + "vmovaps %%ymm13, %%ymm7 \n\t" + "vperm2f128 $0x12, %%ymm13, %%ymm9, %%ymm13 \n\t" + "vperm2f128 $0x30, %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmovaps %%ymm14, %%ymm7 \n\t" + "vperm2f128 $0x12, %%ymm14, %%ymm10, %%ymm14 \n\t" + "vperm2f128 $0x30, %%ymm7, %%ymm10, %%ymm10 \n\t" + " \n\t" + "vmovaps %%ymm12, %%ymm7 \n\t" + "vperm2f128 $0x12, %%ymm12, %%ymm8, %%ymm12 \n\t" + "vperm2f128 $0x30, %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + " \n\t" // ymm15: ymm13: ymm11: ymm9: + " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 + " \n\t" // ab10 ab11 ab12 ab13 + " \n\t" // ab20 ab21 ab22 ab23 + " \n\t" // ab30 ab31 ab32 ab33 + " \n\t" // ab40 ab41 ab42 ab43 + " \n\t" // ab50 ab51 ab52 ab53 + " \n\t" // ab60 ab61 ab62 ab63 + " \n\t" // ab70 ) ab71 ) ab72 ) ab73 ) + " \n\t" + " \n\t" // ymm14: ymm12: ymm10: ymm8: + " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83 + " \n\t" // ab90 ab91 ab92 ab93 + " \n\t" // aba0 aba1 aba2 aba3 + " \n\t" // abb0 abb1 abb2 abb3 + " \n\t" // abc0 abc1 abc2 abc3 + " \n\t" // abd0 abd1 abd2 abd3 + " \n\t" // abe0 abe1 abe2 abe3 + " \n\t" // abf0 ) abf1 ) abf2 ) abf3 ) + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" // scale by alpha + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "vbroadcastss (%%rax), %%ymm7 \n\t" // load alpha_r and duplicate + "vbroadcastss 4(%%rax), %%ymm6 \n\t" // load alpha_i and duplicate + " \n\t" + "vpermilps $0xb1, %%ymm15, %%ymm3 \n\t" + "vmulps %%ymm7, %%ymm15, %%ymm15 \n\t" + "vmulps %%ymm6, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm15, %%ymm15 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm14, %%ymm2 \n\t" + "vmulps %%ymm7, %%ymm14, %%ymm14 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm14, %%ymm14 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm13, %%ymm1 \n\t" + "vmulps %%ymm7, %%ymm13, %%ymm13 \n\t" + "vmulps %%ymm6, %%ymm1, %%ymm1 \n\t" + "vaddsubps %%ymm1, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm12, %%ymm0 \n\t" + "vmulps %%ymm7, %%ymm12, %%ymm12 \n\t" + "vmulps %%ymm6, %%ymm0, %%ymm0 \n\t" + "vaddsubps %%ymm0, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm11, %%ymm3 \n\t" + "vmulps %%ymm7, %%ymm11, %%ymm11 \n\t" + "vmulps %%ymm6, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm11, %%ymm11 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm10, %%ymm2 \n\t" + "vmulps %%ymm7, %%ymm10, %%ymm10 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm10, %%ymm10 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm9, %%ymm1 \n\t" + "vmulps %%ymm7, %%ymm9, %%ymm9 \n\t" + "vmulps %%ymm6, %%ymm1, %%ymm1 \n\t" + "vaddsubps %%ymm1, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm8, %%ymm0 \n\t" + "vmulps %%ymm7, %%ymm8, %%ymm8 \n\t" + "vmulps %%ymm6, %%ymm0, %%ymm0 \n\t" + "vaddsubps %%ymm0, %%ymm8, %%ymm8 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastss (%%rbx), %%ymm7 \n\t" // load beta_r and duplicate + "vbroadcastss 4(%%rbx), %%ymm6 \n\t" // load beta_i and duplicate + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %7, %%rsi \n\t" // load rs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(scomplex) + " \n\t" + "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; + " \n\t" + "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c; + "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // determine if + " \n\t" // c % 32 == 0, AND + " \n\t" // 8*cs_c % 32 == 0, AND + " \n\t" // rs_c == 1 + " \n\t" // ie: aligned, ldim aligned, and + " \n\t" // column-stored + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. + "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); + "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. + "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); + "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. + "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); + " \n\t" // and(bl,bh) followed by + " \n\t" // and(bh,al) will reveal result + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + " \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomiss %%xmm0, %%xmm7 \n\t" // set ZF if beta_r == 0. + "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); + "vucomiss %%xmm0, %%xmm6 \n\t" // set ZF if beta_i == 0. + "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); + "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. + "jne .CBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case + " \n\t" + " \n\t" + " \n\t" // check if aligned/column-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. + "jne .CCOLSTORED \n\t" // jump to column storage case + " \n\t" + " \n\t" + " \n\t" + ".CGENSTORED: \n\t" + " \n\t" + " \n\t" // update c00:c70 + " \n\t" + "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c00,10) into xmm0[0:1] + "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c20,30) into xmm0[2:3] + "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c40,50) into xmm2[0:1] + "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c60,70) into xmm2[2:3] + "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c00,c10) + "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c20,c30) + "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c40,c50) + "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c60,c70) + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c80:cf0 + " \n\t" + "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c80,90) into xmm0[0:1] + "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca0,b0) into xmm0[2:3] + "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc0,d0) into xmm2[0:1] + "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce0,f0) into xmm2[2:3] + "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c80,c90) + "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca0,cb0) + "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc0,cd0) + "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce0,cf0) + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c01:c71 + " \n\t" + "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c01,11) into xmm0[0:1] + "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c21,31) into xmm0[2:3] + "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c41,51) into xmm2[0:1] + "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c61,71) into xmm2[2:3] + "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c01,c11) + "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c21,c31) + "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c41,c51) + "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c61,c71) + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c81:cf1 + " \n\t" + "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c81,91) into xmm0[0:1] + "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca1,b1) into xmm0[2:3] + "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc1,d1) into xmm2[0:1] + "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce1,f1) into xmm2[2:3] + "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c81,c91) + "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca1,cb1) + "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc1,cd1) + "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce1,cf1) + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c02:c72 + " \n\t" + "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c02,12) into xmm0[0:1] + "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c22,32) into xmm0[2:3] + "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c42,52) into xmm2[0:1] + "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c62,72) into xmm2[2:3] + "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c02,c12) + "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c22,c32) + "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c42,c52) + "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c62,c72) + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c82:cf2 + " \n\t" + "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c82,92) into xmm0[0:1] + "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca2,b2) into xmm0[2:3] + "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc2,d2) into xmm2[0:1] + "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce2,f2) into xmm2[2:3] + "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c82,c92) + "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca2,cb2) + "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc2,cd2) + "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce2,cf2) + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c03:c73 + " \n\t" + "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c03,13) into xmm0[0:1] + "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c23,33) into xmm0[2:3] + "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c43,53) into xmm2[0:1] + "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c63,73) into xmm2[2:3] + "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c03,c13) + "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c23,c33) + "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c43,c53) + "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c63,c73) + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c83:cf3 + " \n\t" + "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c83,93) into xmm0[0:1] + "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca3,b3) into xmm0[2:3] + "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc3,d3) into xmm2[0:1] + "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce3,f3) into xmm2[2:3] + "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c83,c93) + "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca3,cb3) + "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc3,cd3) + "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce3,cf3) + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".CCOLSTORED: \n\t" + " \n\t" + " \n\t" // update c00:c70 + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70 into ymm0 + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vmovaps %%ymm0, (%%rcx) \n\t" // store c00:c70 + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c80:cf0 + " \n\t" + "vmovaps (%%rdx), %%ymm0 \n\t" // load c80:f0 into ymm0 + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vmovaps %%ymm0, (%%rdx) \n\t" // store c80:cf0 + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c00:c70 + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" // load c01:c71 into ymm0 + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vmovaps %%ymm0, (%%rcx) \n\t" // store c01:c71 + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c81:cf1 + " \n\t" + "vmovaps (%%rdx), %%ymm0 \n\t" // load c81:f1 into ymm0 + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vmovaps %%ymm0, (%%rdx) \n\t" // store c81:cf1 + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c02:c72 + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72 into ymm0 + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vmovaps %%ymm0, (%%rcx) \n\t" // store c02:c72 + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c82:cf2 + " \n\t" + "vmovaps (%%rdx), %%ymm0 \n\t" // load c82:f2 into ymm0 + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vmovaps %%ymm0, (%%rdx) \n\t" // store c82:cf2 + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c03:c73 + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" // load c03:c73 into ymm0 + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vmovaps %%ymm0, (%%rcx) \n\t" // store c03:c73 + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c83:cf3 + " \n\t" + "vmovaps (%%rdx), %%ymm0 \n\t" // load c83:f3 into ymm0 + "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta + "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" + "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 + "vmovaps %%ymm0, (%%rdx) \n\t" // store c83:cf3 + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".CBETAZERO: \n\t" + " \n\t" // check if aligned/column-stored + " \n\t" // check if aligned/column-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. + "jne .CCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + " \n\t" + " \n\t" + ".CGENSTORBZ: \n\t" + " \n\t" + " \n\t" // update c00:c70 + " \n\t" + "vextractf128 $1, %%ymm15, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm15, (%%rcx) \n\t" // store (c00,c10) + "vmovhpd %%xmm15, (%%rcx,%%rsi) \n\t" // store (c20,c30) + "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c40,c50) + "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c60,c70) + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c80:cf0 + " \n\t" + "vextractf128 $1, %%ymm14, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm14, (%%rdx) \n\t" // store (c80,c90) + "vmovhpd %%xmm14, (%%rdx,%%rsi) \n\t" // store (ca0,cb0) + "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc0,cd0) + "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce0,cf0) + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c01:c71 + " \n\t" + "vextractf128 $1, %%ymm13, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm13, (%%rcx) \n\t" // store (c01,c11) + "vmovhpd %%xmm13, (%%rcx,%%rsi) \n\t" // store (c21,c31) + "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c41,c51) + "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c61,c71) + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c81:cf1 + " \n\t" + "vextractf128 $1, %%ymm12, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm12, (%%rdx) \n\t" // store (c81,c91) + "vmovhpd %%xmm12, (%%rdx,%%rsi) \n\t" // store (ca1,cb1) + "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc1,cd1) + "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce1,cf1) + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c02:c72 + " \n\t" + "vextractf128 $1, %%ymm11, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm11, (%%rcx) \n\t" // store (c02,c12) + "vmovhpd %%xmm11, (%%rcx,%%rsi) \n\t" // store (c22,c32) + "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c42,c52) + "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c62,c72) + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c82:cf2 + " \n\t" + "vextractf128 $1, %%ymm10, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm10, (%%rdx) \n\t" // store (c82,c92) + "vmovhpd %%xmm10, (%%rdx,%%rsi) \n\t" // store (ca2,cb2) + "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc2,cd2) + "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce2,cf2) + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c03:c73 + " \n\t" + "vextractf128 $1, %%ymm9, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm9, (%%rcx) \n\t" // store (c03,c13) + "vmovhpd %%xmm9, (%%rcx,%%rsi) \n\t" // store (c23,c33) + "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c43,c53) + "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c63,c73) + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" // update c83:cf3 + " \n\t" + "vextractf128 $1, %%ymm8, %%xmm2 \n\t" // xmm2 := ymm0[4:7] + "vmovlpd %%xmm8, (%%rdx) \n\t" // store (c83,c93) + "vmovhpd %%xmm8, (%%rdx,%%rsi) \n\t" // store (ca3,cb3) + "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc3,cd3) + "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce3,cf3) + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".CCOLSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm15, (%%rcx) \n\t" // store c00:c70 + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + "vmovaps %%ymm14, (%%rdx) \n\t" // store c80:cf0 + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + "vmovaps %%ymm13, (%%rcx) \n\t" // store c01:c71 + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + "vmovaps %%ymm12, (%%rdx) \n\t" // store c81:cf1 + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + "vmovaps %%ymm11, (%%rcx) \n\t" // store c02:c72 + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + "vmovaps %%ymm10, (%%rdx) \n\t" // store c82:cf2 + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + "vmovaps %%ymm9, (%%rcx) \n\t" // store c03:c73 + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + "vmovaps %%ymm8, (%%rdx) \n\t" // store c83:cf3 + "addq %%rdi, %%rdx \n\t" // c += cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".CDONE: \n\t" + " \n\t" + + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } @@ -1779,8 +2707,8 @@ void bli_zgemm_asm_4x4( auxinfo_t* data ) { - void* a_next = bli_auxinfo_next_a( data ); - void* b_next = bli_auxinfo_next_b( data ); + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); dim_t k_iter = k / 4; dim_t k_left = k % 4; @@ -1791,7 +2719,7 @@ void bli_zgemm_asm_4x4( " \n\t" "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. - "movq %9, %%r15 \n\t" // load address of b_next. + //"movq %9, %%r15 \n\t" // load address of b_next. //"movq %10, %%r14 \n\t" // load address of a_next. " \n\t" "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading @@ -1800,8 +2728,8 @@ void bli_zgemm_asm_4x4( " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" - "leaq (,%%rdi,2), %%rdi \n\t" // cs_c *= sizeof(dcomplex) + "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(dcomplex) + "leaq (,%%rdi,2), %%rdi \n\t" "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c; " \n\t" "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c @@ -2248,6 +3176,7 @@ void bli_zgemm_asm_4x4( " \n\t" " \n\t" " \n\t" + " \n\t" " \n\t" // determine if " \n\t" // c % 32 == 0, AND " \n\t" // 16*cs_c % 32 == 0, AND @@ -2616,9 +3545,9 @@ void bli_zgemm_asm_4x4( "m" (beta), // 5 "m" (c), // 6 "m" (rs_c), // 7 - "m" (cs_c), // 8 + "m" (cs_c)/*, // 8 "m" (b_next), // 9 - "m" (a_next) // 10 + "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",