mirror of
https://github.com/amd/blis.git
synced 2026-05-13 02:25:39 +00:00
Optimized AVX2 DGEMM SUP and small edge kernels.
- Re-designed the new edge kernels that use masked load-store instructions for handling corner cases. - Mask load-store instruction macros are added. vmovdqu, VMOVDQU for setting up the mask. vmaskmovpd, VMASKMOVPD for masked load-store. - The following edge kernels are added for 6x8m dgemm sup. n-left edge kernels - bli_dgemmsup_rv_haswell_asm_6x7m - bli_dgemmsup_rv_haswell_asm_6x5m - bli_dgemmsup_rv_haswell_asm_6x3m m-left edge kernels - bli_dgemmsup_rv_haswell_asm_5x7 - bli_dgemmsup_rv_haswell_asm_4x7 - bli_dgemmsup_rv_haswell_asm_3x7 - bli_dgemmsup_rv_haswell_asm_2x7 - bli_dgemmsup_rv_haswell_asm_1x7 - bli_dgemmsup_rv_haswell_asm_5x5 - bli_dgemmsup_rv_haswell_asm_4x5 - bli_dgemmsup_rv_haswell_asm_3x5 - bli_dgemmsup_rv_haswell_asm_2x5 - bli_dgemmsup_rv_haswell_asm_1x5 - bli_dgemmsup_rv_haswell_asm_5x3 - bli_dgemmsup_rv_haswell_asm_4x3 - bli_dgemmsup_rv_haswell_asm_3x3 - bli_dgemmsup_rv_haswell_asm_2x3 - bli_dgemmsup_rv_haswell_asm_1x3 - For 16x3 dgemm_small, the m_left computation is handled with masked load-store instructions to avoid the overhead of conditional checks for edge cases. - It improves performance by reducing branching overhead and by being more cache friendly. AMD-Internal: [CPUPL-3574] Change-Id: I976d6a9209d2a1a02b2830d03d21d200a5aad173
This commit is contained in:
@@ -776,6 +776,7 @@
|
||||
#define VMOVHPD(...) INSTR_(vmovhpd, __VA_ARGS__) // uppercase macro for the vmovhpd mnemonic, routed through INSTR_
|
||||
#define VMOVDQA(_0, _1) INSTR_(vmovdqa, _0, _1) // uppercase macro for vmovdqa (aligned integer move), 2 operands
|
||||
#define VMOVDQA32(_0, _1) INSTR_(vmovdqa32, _0, _1) // uppercase macro for vmovdqa32, routed through INSTR_
|
||||
#define VMOVDQU(_0, _1) INSTR_(vmovdqu, _0, _1) // uppercase macro for vmovdqu (unaligned integer move); used to set up load-store masks per the commit description
|
||||
#define VMOVDQA64(_0, _1) INSTR_(vmovdqa64, _0, _1) // uppercase macro for vmovdqa64, routed through INSTR_
|
||||
#define VBROADCASTSS(_0, _1) INSTR_(vbroadcastss, _0, _1) // uppercase macro for vbroadcastss (broadcast scalar single)
|
||||
#define VBROADCASTSD(_0, _1) INSTR_(vbroadcastsd, _0, _1) // uppercase macro for vbroadcastsd (broadcast scalar double)
|
||||
@@ -809,6 +810,7 @@
|
||||
#define vmovhpd(...) VMOVHPD(__VA_ARGS__) // lowercase alias forwarding to VMOVHPD
|
||||
#define vmovdqa(_0, _1) VMOVDQA(_0, _1) // lowercase alias forwarding to VMOVDQA
|
||||
#define vmovdqa32(_0, _1) VMOVDQA32(_0, _1) // lowercase alias forwarding to VMOVDQA32
|
||||
#define vmovdqu(_0, _1) VMOVDQU(_0, _1) // lowercase alias forwarding to VMOVDQU
|
||||
#define vmovdqa64(_0, _1) VMOVDQA64(_0, _1) // lowercase alias forwarding to VMOVDQA64
|
||||
#define vbroadcastss(_0, _1) VBROADCASTSS(_0, _1) // lowercase alias forwarding to VBROADCASTSS
|
||||
#define vbroadcastsd(_0, _1) VBROADCASTSD(_0, _1) // lowercase alias forwarding to VBROADCASTSD
|
||||
@@ -911,6 +913,7 @@
|
||||
#define VCOMISS(_0, _1) INSTR_(vcomiss, _0, _1) // uppercase macro for vcomiss (ordered scalar-single compare), routed through INSTR_
|
||||
#define VCOMISD(_0, _1) INSTR_(vcomisd, _0, _1) // uppercase macro for vcomisd (ordered scalar-double compare), routed through INSTR_
|
||||
|
||||
#define VMASKMOVPD(_0, _1, _2) INSTR_(vmaskmovpd, _0, _1, _2) // uppercase macro for vmaskmovpd (AVX masked packed-double load/store), 3 operands; added for the masked edge kernels
|
||||
#define VFMADD132SS(_0, _1, _2) INSTR_(vfmadd132ss, _0, _1, _2) // uppercase macro for vfmadd132ss (scalar-single FMA, 132 operand order)
|
||||
#define VFMADD213SS(_0, _1, _2) INSTR_(vfmadd213ss, _0, _1, _2) // uppercase macro for vfmadd213ss (scalar-single FMA, 213 operand order)
|
||||
#define VFMADD231SS(_0, _1, _2) INSTR_(vfmadd231ss, _0, _1, _2) // uppercase macro for vfmadd231ss (scalar-single FMA, 231 operand order)
|
||||
@@ -1236,7 +1239,7 @@
|
||||
#define vblendpd(_0, _1, _2, _3) VBLENDPD(_0, _1, _2, _3) // lowercase alias forwarding to VBLENDPD (immediate-controlled blend, 4 operands)
|
||||
// Lowercase alias for the vblendmps (mask-register-controlled packed-single
// blend) instruction. Fixed: this previously forwarded to VBLENDMSD, which
// matches no x86 mnemonic ("vblendmsd" does not exist); the correct
// name-preserving target is VBLENDMPS, mirroring the vblendmpd -> VBLENDMPD
// mapping directly below.
#define vblendmps(_0, _1, _2) VBLENDMPS(_0, _1, _2)
|
||||
#define vblendmpd(_0, _1, _2) VBLENDMPD(_0, _1, _2) // lowercase alias forwarding to VBLENDMPD (mask-controlled packed-double blend)
|
||||
|
||||
#define vmaskmovpd(_0, _1, _2) VMASKMOVPD(_0, _1, _2) // lowercase alias forwarding to VMASKMOVPD (masked load/store for edge kernels)
|
||||
// Prefetches
|
||||
|
||||
#define PREFETCH(_0, _1) INSTR_(prefetcht##_0, _1) // PREFETCH(level, mem): token-pastes the hint level to form prefetcht0/prefetcht1/prefetcht2
|
||||
|
||||
Reference in New Issue
Block a user