Fixed unimplemented case in core2 sgemm ukernel.

Details:
- Implemented the "beta == 0" case for general stride output for the
  Dunnington sgemm micro-kernel. Until now, this case had been
  identical to the "beta != 0" case, which does not work when the
  output matrix contains NaNs and Infs. The bug had manifested as NaN
  residuals in the test suite for right-side tests of ctrsm4m1a.
  Thanks to Devin Matthews for reporting this bug.
This commit is contained in:
Field G. Van Zee
2015-11-12 15:22:50 -06:00
parent 42810bbfa0
commit f0a4f41b5a

View File

@@ -73,23 +73,23 @@ void bli_sgemm_opt_8x4(
" \n\t"
"prefetcht2 0 * 4(%%r9) \n\t" // prefetch b_next
" \n\t"
"xorpd %%xmm3, %%xmm3 \n\t"
"xorpd %%xmm4, %%xmm4 \n\t"
"xorpd %%xmm5, %%xmm5 \n\t"
"xorpd %%xmm6, %%xmm6 \n\t"
"xorps %%xmm3, %%xmm3 \n\t"
"xorps %%xmm4, %%xmm4 \n\t"
"xorps %%xmm5, %%xmm5 \n\t"
"xorps %%xmm6, %%xmm6 \n\t"
" \n\t"
"prefetcht0 6 * 4(%%rcx) \n\t" // prefetch c + 0*cs_c
"xorpd %%xmm8, %%xmm8 \n\t"
"movaps %%xmm8, %%xmm9 \n\t"
"prefetcht0 6 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
"movaps %%xmm8, %%xmm10 \n\t"
"movaps %%xmm8, %%xmm11 \n\t"
"prefetcht0 6 * 4(%%r10) \n\t" // prefetch c + 2*cs_c
"movaps %%xmm8, %%xmm12 \n\t"
"movaps %%xmm8, %%xmm13 \n\t"
"prefetcht0 6 * 4(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
"movaps %%xmm8, %%xmm14 \n\t"
"movaps %%xmm8, %%xmm15 \n\t"
"prefetcht2 6 * 4(%%rcx) \n\t" // prefetch c + 0*cs_c
"xorps %%xmm8, %%xmm8 \n\t"
"xorps %%xmm9, %%xmm9 \n\t"
"prefetcht2 6 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
"xorps %%xmm10, %%xmm10 \n\t"
"xorps %%xmm11, %%xmm11 \n\t"
"prefetcht2 6 * 4(%%r10) \n\t" // prefetch c + 2*cs_c
"xorps %%xmm12, %%xmm12 \n\t"
"xorps %%xmm13, %%xmm13 \n\t"
"prefetcht2 6 * 4(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
"xorps %%xmm14, %%xmm14 \n\t"
"xorps %%xmm15, %%xmm15 \n\t"
" \n\t"
" \n\t"
" \n\t"
@@ -646,8 +646,119 @@ void bli_sgemm_opt_8x4(
" \n\t"
".SGENSTORBZ: \n\t"
" \n\t"
"jmp .SGENSTORED \n\t" // use gen-stored beta != 0 case for now
//"jmp .SDONE \n\t" // jump to end.
"mulps %%xmm6, %%xmm8 \n\t" // scale by alpha,
"movaps %%xmm8, %%xmm0 \n\t"
" \n\t"
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t"
" \n\t"
" \n\t"
"mulps %%xmm6, %%xmm12 \n\t" // scale by alpha,
"movaps %%xmm12, %%xmm0 \n\t"
" \n\t"
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"mulps %%xmm6, %%xmm9 \n\t" // scale by alpha,
"movaps %%xmm9, %%xmm0 \n\t"
" \n\t"
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t"
" \n\t"
" \n\t"
"mulps %%xmm6, %%xmm13 \n\t" // scale by alpha,
"movaps %%xmm13, %%xmm0 \n\t"
" \n\t"
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"mulps %%xmm6, %%xmm10 \n\t" // scale by alpha,
"movaps %%xmm10, %%xmm0 \n\t"
" \n\t"
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t"
" \n\t"
" \n\t"
"mulps %%xmm6, %%xmm14 \n\t" // scale by alpha,
"movaps %%xmm14, %%xmm0 \n\t"
" \n\t"
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"mulps %%xmm6, %%xmm11 \n\t" // scale by alpha,
"movaps %%xmm11, %%xmm0 \n\t"
" \n\t"
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
"mulps %%xmm6, %%xmm15 \n\t" // scale by alpha,
"movaps %%xmm15, %%xmm0 \n\t"
" \n\t"
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
"jmp .SDONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
@@ -770,16 +881,16 @@ void bli_dgemm_opt_4x4(
" \n\t"
"prefetcht2 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c
"xorpd %%xmm8, %%xmm8 \n\t"
"movaps %%xmm8, %%xmm9 \n\t"
"xorpd %%xmm9, %%xmm9 \n\t"
"prefetcht2 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
"movaps %%xmm8, %%xmm10 \n\t"
"movaps %%xmm8, %%xmm11 \n\t"
"xorpd %%xmm10, %%xmm10 \n\t"
"xorpd %%xmm11, %%xmm11 \n\t"
"prefetcht2 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c
"movaps %%xmm8, %%xmm12 \n\t"
"movaps %%xmm8, %%xmm13 \n\t"
"xorpd %%xmm12, %%xmm12 \n\t"
"xorpd %%xmm13, %%xmm13 \n\t"
"prefetcht2 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
"movaps %%xmm8, %%xmm14 \n\t"
"movaps %%xmm8, %%xmm15 \n\t"
"xorpd %%xmm14, %%xmm14 \n\t"
"xorpd %%xmm15, %%xmm15 \n\t"
" \n\t"
" \n\t"
" \n\t"