diff --git a/kernels/x86_64/core2-sse3/3/bli_gemm_opt_d4x4.c b/kernels/x86_64/core2-sse3/3/bli_gemm_opt_d4x4.c index 8b2c3974f..b4dfb1ce9 100644 --- a/kernels/x86_64/core2-sse3/3/bli_gemm_opt_d4x4.c +++ b/kernels/x86_64/core2-sse3/3/bli_gemm_opt_d4x4.c @@ -73,23 +73,23 @@ void bli_sgemm_opt_8x4( " \n\t" "prefetcht2 0 * 4(%%r9) \n\t" // prefetch b_next " \n\t" - "xorpd %%xmm3, %%xmm3 \n\t" - "xorpd %%xmm4, %%xmm4 \n\t" - "xorpd %%xmm5, %%xmm5 \n\t" - "xorpd %%xmm6, %%xmm6 \n\t" + "xorps %%xmm3, %%xmm3 \n\t" + "xorps %%xmm4, %%xmm4 \n\t" + "xorps %%xmm5, %%xmm5 \n\t" + "xorps %%xmm6, %%xmm6 \n\t" " \n\t" - "prefetcht0 6 * 4(%%rcx) \n\t" // prefetch c + 0*cs_c - "xorpd %%xmm8, %%xmm8 \n\t" - "movaps %%xmm8, %%xmm9 \n\t" - "prefetcht0 6 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "movaps %%xmm8, %%xmm10 \n\t" - "movaps %%xmm8, %%xmm11 \n\t" - "prefetcht0 6 * 4(%%r10) \n\t" // prefetch c + 2*cs_c - "movaps %%xmm8, %%xmm12 \n\t" - "movaps %%xmm8, %%xmm13 \n\t" - "prefetcht0 6 * 4(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c - "movaps %%xmm8, %%xmm14 \n\t" - "movaps %%xmm8, %%xmm15 \n\t" + "prefetcht2 6 * 4(%%rcx) \n\t" // prefetch c + 0*cs_c + "xorps %%xmm8, %%xmm8 \n\t" + "xorps %%xmm9, %%xmm9 \n\t" + "prefetcht2 6 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c + "xorps %%xmm10, %%xmm10 \n\t" + "xorps %%xmm11, %%xmm11 \n\t" + "prefetcht2 6 * 4(%%r10) \n\t" // prefetch c + 2*cs_c + "xorps %%xmm12, %%xmm12 \n\t" + "xorps %%xmm13, %%xmm13 \n\t" + "prefetcht2 6 * 4(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c + "xorps %%xmm14, %%xmm14 \n\t" + "xorps %%xmm15, %%xmm15 \n\t" " \n\t" " \n\t" " \n\t" @@ -646,8 +646,119 @@ void bli_sgemm_opt_8x4( " \n\t" ".SGENSTORBZ: \n\t" " \n\t" - "jmp .SGENSTORED \n\t" // use gen-stored beta != 0 case for now - //"jmp .SDONE \n\t" // jump to end. + "mulps %%xmm6, %%xmm8 \n\t" // scale by alpha, + "movaps %%xmm8, %%xmm0 \n\t" + " \n\t" + "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rcx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rcx \n\t" + " \n\t" + " \n\t" + "mulps %%xmm6, %%xmm12 \n\t" // scale by alpha, + "movaps %%xmm12, %%xmm0 \n\t" + " \n\t" + "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rdx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "mulps %%xmm6, %%xmm9 \n\t" // scale by alpha, + "movaps %%xmm9, %%xmm0 \n\t" + " \n\t" + "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rcx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rcx \n\t" + " \n\t" + " \n\t" + "mulps %%xmm6, %%xmm13 \n\t" // scale by alpha, + "movaps %%xmm13, %%xmm0 \n\t" + " \n\t" + "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rdx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "mulps %%xmm6, %%xmm10 \n\t" // scale by alpha, + "movaps %%xmm10, %%xmm0 \n\t" + " \n\t" + "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rcx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rcx \n\t" + " \n\t" + " \n\t" + "mulps %%xmm6, %%xmm14 \n\t" // scale by alpha, + "movaps %%xmm14, %%xmm0 \n\t" + " \n\t" + "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rdx,%%r11 ) \n\t" + " \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "mulps %%xmm6, %%xmm11 \n\t" // scale by alpha, + "movaps %%xmm11, %%xmm0 \n\t" + " \n\t" + "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rcx,%%r11 ) \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "mulps %%xmm6, %%xmm15 \n\t" // scale by alpha, + "movaps %%xmm15, %%xmm0 \n\t" + " \n\t" + "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. + "pshufd $0x39, %%xmm0, %%xmm1 \n\t" + "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" + "pshufd $0x39, %%xmm1, %%xmm2 \n\t" + "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" + "pshufd $0x39, %%xmm2, %%xmm3 \n\t" + "movss %%xmm3, (%%rdx,%%r11 ) \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .SDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" @@ -770,16 +881,16 @@ void bli_dgemm_opt_4x4( " \n\t" "prefetcht2 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c "xorpd %%xmm8, %%xmm8 \n\t" - "movaps %%xmm8, %%xmm9 \n\t" + "xorpd %%xmm9, %%xmm9 \n\t" "prefetcht2 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "movaps %%xmm8, %%xmm10 \n\t" - "movaps %%xmm8, %%xmm11 \n\t" + "xorpd %%xmm10, %%xmm10 \n\t" + "xorpd %%xmm11, %%xmm11 \n\t" "prefetcht2 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c - "movaps %%xmm8, %%xmm12 \n\t" - "movaps %%xmm8, %%xmm13 \n\t" + "xorpd %%xmm12, %%xmm12 \n\t" + "xorpd %%xmm13, %%xmm13 \n\t" "prefetcht2 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c - "movaps %%xmm8, %%xmm14 \n\t" - "movaps %%xmm8, %%xmm15 \n\t" + "xorpd %%xmm14, %%xmm14 \n\t" + "xorpd %%xmm15, %%xmm15 \n\t" " \n\t" " \n\t" " \n\t"