mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
CPUPL-1074:
- Bug fix in the sgemmsup 1x16 kernel for the beta-zero case with column-stored C:
the rcx register increment was missing, and because of this 4 values
in the output were overwritten.
Change-Id: Ia3028040dce3e615f1db5a331498d86faadcf916
This commit is contained in:
committed by
Nallani Bhaskar
parent
7bbcae5a18
commit
d186cfdf2e
@@ -2479,6 +2479,7 @@ void bli_sgemmsup_rv_zen_asm_1x16
|
||||
vmovss(xmm1, mem(rcx, rsi, 1))
|
||||
vmovss(xmm2, mem(rcx, rsi, 2))
|
||||
vmovss(xmm14, mem(rcx, rax, 1))
|
||||
lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c
|
||||
vextractf128(imm(0x0), ymm5, xmm0)//c0-c3
|
||||
vshufps(imm(0x01), xmm0, xmm0,xmm1)
|
||||
vshufps(imm(0x02), xmm0, xmm0,xmm2)
|
||||
|
||||
Reference in New Issue
Block a user