diff --git a/README.md b/README.md index 7142a1329..9fd5c6146 100644 --- a/README.md +++ b/README.md @@ -262,11 +262,17 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an ``` @article{BLIS4, - author = {Tze Meng Low and Francisco D. Igual and Tyler M. Smith and Enrique S. Quintana-Ort\'{\i}}, - title = {Analytical Models for the {BLIS} Framework}, + author = {Tze Meng Low and Francisco D. Igual and Tyler M. Smith and + Enrique S. Quintana-Ort\'{\i}}, + title = {Analytical Modeling Is Enough for High-Performance {BLIS}}, journal = {ACM Transactions on Mathematical Software}, - year = 2016, - note = {Accepted}, + volume = {43}, + number = {2}, + pages = {12:1--12:18}, + month = aug, + year = {2016}, + issue_date = {August 2016}, + url = {http://doi.acm.org/10.1145/2925987}, } ``` diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index 24a3c68b1..d23a00a5d 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -49,26 +49,27 @@ // (b) MR (for zero-padding purposes when MR and NR are "swapped") // -#if 0 +// sgemm micro-kernel +#if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_24x4 #define BLIS_DEFAULT_MC_S 264 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 24 #define BLIS_DEFAULT_NR_S 4 +#endif -#else - -/* +#if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 -*/ +#endif +#if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 @@ -77,29 +78,29 @@ #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS - #endif -#if 0 +// dgemm micro-kernel +#if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_12x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 192 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 12 #define BLIS_DEFAULT_NR_D 4 +#endif -#else - -/* +#if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 6 -*/ +#endif +#if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 @@ -108,10 +109,33 @@ #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS - - #endif +// cgemm micro-kernel + +#if 1 +#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 +#define BLIS_DEFAULT_MC_C 144 +#define BLIS_DEFAULT_KC_C 256 +#define BLIS_DEFAULT_NC_C 4080 +#define BLIS_DEFAULT_MR_C 3 +#define BLIS_DEFAULT_NR_C 8 + +#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif + +// zgemm micro-kernel + +#if 1 +#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 +#define BLIS_DEFAULT_MC_Z 72 +#define BLIS_DEFAULT_KC_Z 256 +#define BLIS_DEFAULT_NC_Z 4080 +#define BLIS_DEFAULT_MR_Z 3 +#define BLIS_DEFAULT_NR_Z 4 + +#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index cb6097fe2..bee1df996 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -249,7 +249,7 @@ void bli_sgemm_asm_6x16 " \n\t" ".SLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" + "prefetcht0 64 * 4(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" @@ -622,6 +622,8 @@ void bli_sgemm_asm_6x16 } + + #define DGEMM_INPUT_GS_BETA_NZ \ "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ @@ -1197,9 +1199,41 @@ void bli_dgemm_asm_6x8 ); } -#if 0 -void bli_cgemm_asm_ + + +// assumes beta.r, beta.i have been broadcast into ymm1, ymm2. +// outputs to ymm0 +#define CGEMM_INPUT_SCALE_GS_BETA_NZ \ + "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ + "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ + "vmovlpd (%%rcx,%%rsi,2), %%xmm3, %%xmm3 \n\t" \ + "vmovhpd (%%rcx,%%r13 ), %%xmm3, %%xmm3 \n\t" \ + "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ + "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" + +// assumes values to output are in ymm0 +#define CGEMM_OUTPUT_GS \ + "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ + "vmovlpd %%xmm0, (%%rcx ) \n\t" \ + "vmovhpd %%xmm0, (%%rcx,%%rsi,1) \n\t" \ + "vmovlpd %%xmm3, (%%rcx,%%rsi,2) \n\t" \ + "vmovhpd %%xmm3, (%%rcx,%%r13 ) \n\t" + +#define CGEMM_INPUT_SCALE_RS_BETA_NZ \ + "vmovups (%%rcx), %%ymm0 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ + "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" + +#define CGEMM_OUTPUT_RS \ + "vmovups %%ymm0, (%%rcx) \n\t" \ + +void bli_cgemm_asm_3x8 ( dim_t k, scomplex* restrict alpha, @@ -1214,14 +1248,515 @@ void bli_cgemm_asm_ //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); - //dim_t k_iter = k / 4; - //dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. + " \n\t" + "addq $32 * 4, %%rbx \n\t" + " \n\t" // initialize loop by pre-loading + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %7, %%rdi \n\t" // load rs_c + "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(scomplex) + " \n\t" + "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*rs_c; + "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*rs_c; + " \n\t" + "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c + "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*rs_c + "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".CLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 32 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastss 6 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 7 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 8 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 10 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 2 + "prefetcht0 38 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastss 12 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 14 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 16 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 17 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 3 + "vbroadcastss 18 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 19 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 20 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 21 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 22 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 23 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $4 * 3 * 8, %%rax \n\t" // a += 4*3 (unroll x mr) + "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .CLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".CCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. + " \n\t" + " \n\t" + ".CLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "prefetcht0 32 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $1 * 3 * 8, %%rax \n\t" // a += 1*3 (unroll x mr) + "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .CLOOPKLEFT \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + ".CPOSTACCUM: \n\t" + " \n\t" + " \n\t" + " \n\t" // permute even and odd elements + " \n\t" // of ymm6/7, ymm10/11, ymm/14/15 + "vpermilps $0xb1, %%ymm6, %%ymm6 \n\t" + "vpermilps $0xb1, %%ymm7, %%ymm7 \n\t" + "vpermilps $0xb1, %%ymm10, %%ymm10 \n\t" + "vpermilps $0xb1, %%ymm11, %%ymm11 \n\t" + "vpermilps $0xb1, %%ymm14, %%ymm14 \n\t" + "vpermilps $0xb1, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" // subtract/add even/odd elements + "vaddsubps %%ymm6, %%ymm4, %%ymm4 \n\t" + "vaddsubps %%ymm7, %%ymm5, %%ymm5 \n\t" + " \n\t" + "vaddsubps %%ymm10, %%ymm8, %%ymm8 \n\t" + "vaddsubps %%ymm11, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vaddsubps %%ymm14, %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm15, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate + "vbroadcastss 4(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate + " \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm4, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm4, %%ymm4 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm5, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm5, %%ymm5 \n\t" + " \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm8, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm9, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm9, %%ymm9 \n\t" + " \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm12, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm13, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastss (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate + "vbroadcastss 4(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %8, %%rsi \n\t" // load cs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(scomplex) + "leaq (,%%rsi,4), %%rdx \n\t" // rdx = 4*cs_c; + "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomiss %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. + "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); + "vucomiss %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. + "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); + "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. + "jne .CBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case + " \n\t" + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "jz .CROWSTORED \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".CGENSTORED: \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".CROWSTORED: \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".CBETAZERO: \n\t" + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "jz .CROWSTORBZ \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".CGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm4, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm5, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm8, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm9, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm12, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm13, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".CROWSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" + "vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm8, (%%r11) \n\t" + "vmovups %%ymm9, (%%r11,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm12, (%%r12) \n\t" + "vmovups %%ymm13, (%%r12,%%rdx,1) \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".CDONE: \n\t" + " \n\t" + + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } -void bli_zgemm_asm_ + +// assumes beta.r, beta.i have been broadcast into ymm1, ymm2. +// outputs to ymm0 +#define ZGEMM_INPUT_SCALE_GS_BETA_NZ \ + "vmovupd (%%rcx), %%xmm0 \n\t" \ + "vmovupd (%%rcx,%%rsi), %%xmm3 \n\t" \ + "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ + "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" + +// assumes values to output are in ymm0 +#define ZGEMM_OUTPUT_GS \ + "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ + "vmovupd %%xmm0, (%%rcx) \n\t" \ + "vmovupd %%xmm3, (%%rcx,%%rsi ) \n\t" \ + +#define ZGEMM_INPUT_SCALE_RS_BETA_NZ \ + "vmovups (%%rcx), %%ymm0 \n\t" \ + "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" + +#define ZGEMM_OUTPUT_RS \ + "vmovupd %%ymm0, (%%rcx) \n\t" \ + +void bli_zgemm_asm_3x4 ( dim_t k, dcomplex* restrict alpha, @@ -1236,9 +1771,484 @@ void bli_zgemm_asm_ //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); - //dim_t k_iter = k / 4; - //dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; + //uint64_t alpha_is_unit = bli_zeq1( *alpha ); + + + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. + " \n\t" + "addq $32 * 4, %%rbx \n\t" + " \n\t" // initialize loop by pre-loading + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %7, %%rdi \n\t" // load rs_c + "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(dcomplex) + "leaq (,%%rdi,2), %%rdi \n\t" + " \n\t" + "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*rs_c; + "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*rs_c; + " \n\t" + "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c + "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*rs_c + "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .ZCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".ZLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 32 * 16(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 2 + "prefetcht0 38 * 16(%%rax) \n\t" + " \n\t" + "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 3 + "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $4 * 3 * 16, %%rax \n\t" // a += 4*3 (unroll x mr) + "addq $4 * 4 * 16, %%rbx \n\t" // b += 4*4 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .ZLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".ZCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .ZPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. + " \n\t" + " \n\t" + ".ZLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "prefetcht0 32 * 16(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $1 * 3 * 16, %%rax \n\t" // a += 1*3 (unroll x mr) + "addq $1 * 4 * 16, %%rbx \n\t" // b += 1*4 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .ZLOOPKLEFT \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + ".ZPOSTACCUM: \n\t" + " \n\t" + " \n\t" // permute even and odd elements + " \n\t" // of ymm6/7, ymm10/11, ymm/14/15 + "vpermilpd $0x5, %%ymm6, %%ymm6 \n\t" + "vpermilpd $0x5, %%ymm7, %%ymm7 \n\t" + "vpermilpd $0x5, %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5, %%ymm11, %%ymm11 \n\t" + "vpermilpd $0x5, %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x5, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" // subtract/add even/odd elements + "vaddsubpd %%ymm6, %%ymm4, %%ymm4 \n\t" + "vaddsubpd %%ymm7, %%ymm5, %%ymm5 \n\t" + " \n\t" + "vaddsubpd %%ymm10, %%ymm8, %%ymm8 \n\t" + "vaddsubpd %%ymm11, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vaddsubpd %%ymm14, %%ymm12, %%ymm12 \n\t" + "vaddsubpd %%ymm15, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate + "vbroadcastsd 8(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate + " \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm4, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm4, %%ymm4 \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm5, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm5, %%ymm5 \n\t" + " \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm8, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm9, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm9, %%ymm9 \n\t" + " \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm12, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm13, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastsd (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate + "vbroadcastsd 8(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %8, %%rsi \n\t" // load cs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(dcomplex) + "leaq (,%%rsi,2), %%rsi \n\t" + "leaq (,%%rsi,2), %%rdx \n\t" // rdx = 2*cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomisd %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. + "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); + "vucomisd %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. + "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); + "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. + "jne .ZBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case + " \n\t" + " \n\t" + "cmpq $16, %%rsi \n\t" // set ZF if (16*cs_c) == 16. + "jz .ZROWSTORED \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".ZGENSTORED: \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .ZDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".ZROWSTORED: \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + " \n\t" + " \n\t" + " \n\t" + "jmp .ZDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".ZBETAZERO: \n\t" + " \n\t" + "cmpq $16, %%rsi \n\t" // set ZF if (16*cs_c) == 16. + "jz .ZROWSTORBZ \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".ZGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm4, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm5, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm8, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm9, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm12, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm13, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .ZDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".ZROWSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" + "vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm8, (%%r11) \n\t" + "vmovups %%ymm9, (%%r11,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm12, (%%r12) \n\t" + "vmovups %%ymm13, (%%r12,%%rdx,1) \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".ZDONE: \n\t" + " \n\t" + + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } -#endif