mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Fixed unimplemented case in core2 sgemm ukernel.
Details: - Implemented the "beta == 0" case for general-stride output in the dunnington sgemm micro-kernel. Until now, this case had been handled identically to the "beta != 0" case, which does not work when the output matrix contains NaNs or Infs (scaling them by a zero beta yields NaN rather than zero, so they propagate into the result instead of being overwritten). The bug had manifested as NaN residuals in the test suite for right-side tests of ctrsm4m1a. Thanks to Devin Matthews for reporting this bug.
This commit is contained in:
@@ -73,23 +73,23 @@ void bli_sgemm_opt_8x4(
|
||||
" \n\t"
|
||||
"prefetcht2 0 * 4(%%r9) \n\t" // prefetch b_next
|
||||
" \n\t"
|
||||
"xorpd %%xmm3, %%xmm3 \n\t"
|
||||
"xorpd %%xmm4, %%xmm4 \n\t"
|
||||
"xorpd %%xmm5, %%xmm5 \n\t"
|
||||
"xorpd %%xmm6, %%xmm6 \n\t"
|
||||
"xorps %%xmm3, %%xmm3 \n\t"
|
||||
"xorps %%xmm4, %%xmm4 \n\t"
|
||||
"xorps %%xmm5, %%xmm5 \n\t"
|
||||
"xorps %%xmm6, %%xmm6 \n\t"
|
||||
" \n\t"
|
||||
"prefetcht0 6 * 4(%%rcx) \n\t" // prefetch c + 0*cs_c
|
||||
"xorpd %%xmm8, %%xmm8 \n\t"
|
||||
"movaps %%xmm8, %%xmm9 \n\t"
|
||||
"prefetcht0 6 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
|
||||
"movaps %%xmm8, %%xmm10 \n\t"
|
||||
"movaps %%xmm8, %%xmm11 \n\t"
|
||||
"prefetcht0 6 * 4(%%r10) \n\t" // prefetch c + 2*cs_c
|
||||
"movaps %%xmm8, %%xmm12 \n\t"
|
||||
"movaps %%xmm8, %%xmm13 \n\t"
|
||||
"prefetcht0 6 * 4(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
|
||||
"movaps %%xmm8, %%xmm14 \n\t"
|
||||
"movaps %%xmm8, %%xmm15 \n\t"
|
||||
"prefetcht2 6 * 4(%%rcx) \n\t" // prefetch c + 0*cs_c
|
||||
"xorps %%xmm8, %%xmm8 \n\t"
|
||||
"xorps %%xmm9, %%xmm9 \n\t"
|
||||
"prefetcht2 6 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
|
||||
"xorps %%xmm10, %%xmm10 \n\t"
|
||||
"xorps %%xmm11, %%xmm11 \n\t"
|
||||
"prefetcht2 6 * 4(%%r10) \n\t" // prefetch c + 2*cs_c
|
||||
"xorps %%xmm12, %%xmm12 \n\t"
|
||||
"xorps %%xmm13, %%xmm13 \n\t"
|
||||
"prefetcht2 6 * 4(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
|
||||
"xorps %%xmm14, %%xmm14 \n\t"
|
||||
"xorps %%xmm15, %%xmm15 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -646,8 +646,119 @@ void bli_sgemm_opt_8x4(
|
||||
" \n\t"
|
||||
".SGENSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
"jmp .SGENSTORED \n\t" // use gen-stored beta != 0 case for now
|
||||
//"jmp .SDONE \n\t" // jump to end.
|
||||
"mulps %%xmm6, %%xmm8 \n\t" // scale by alpha,
|
||||
"movaps %%xmm8, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm12 \n\t" // scale by alpha,
|
||||
"movaps %%xmm12, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm9 \n\t" // scale by alpha,
|
||||
"movaps %%xmm9, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm13 \n\t" // scale by alpha,
|
||||
"movaps %%xmm13, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm10 \n\t" // scale by alpha,
|
||||
"movaps %%xmm10, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm14 \n\t" // scale by alpha,
|
||||
"movaps %%xmm14, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm11 \n\t" // scale by alpha,
|
||||
"movaps %%xmm11, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm15 \n\t" // scale by alpha,
|
||||
"movaps %%xmm15, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"jmp .SDONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -770,16 +881,16 @@ void bli_dgemm_opt_4x4(
|
||||
" \n\t"
|
||||
"prefetcht2 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c
|
||||
"xorpd %%xmm8, %%xmm8 \n\t"
|
||||
"movaps %%xmm8, %%xmm9 \n\t"
|
||||
"xorpd %%xmm9, %%xmm9 \n\t"
|
||||
"prefetcht2 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
|
||||
"movaps %%xmm8, %%xmm10 \n\t"
|
||||
"movaps %%xmm8, %%xmm11 \n\t"
|
||||
"xorpd %%xmm10, %%xmm10 \n\t"
|
||||
"xorpd %%xmm11, %%xmm11 \n\t"
|
||||
"prefetcht2 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c
|
||||
"movaps %%xmm8, %%xmm12 \n\t"
|
||||
"movaps %%xmm8, %%xmm13 \n\t"
|
||||
"xorpd %%xmm12, %%xmm12 \n\t"
|
||||
"xorpd %%xmm13, %%xmm13 \n\t"
|
||||
"prefetcht2 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
|
||||
"movaps %%xmm8, %%xmm14 \n\t"
|
||||
"movaps %%xmm8, %%xmm15 \n\t"
|
||||
"xorpd %%xmm14, %%xmm14 \n\t"
|
||||
"xorpd %%xmm15, %%xmm15 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
|
||||
Reference in New Issue
Block a user