Optimized AVX2 DGEMM SUP and dgemm_small edge kernels.

- Redesigned the edge kernels to use masked load-store
  instructions for handling corner cases.

- Masked load-store instruction macros are added:
  vmovdqu, VMOVDQU for setting up the mask;
  vmaskmovpd, VMASKMOVPD for masked loads and stores.
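
  For reference, the intrinsic equivalents of these instructions are
  _mm256_loadu_si256 (vmovdqu) and _mm256_maskload_pd /
  _mm256_maskstore_pd (vmaskmovpd). A minimal C sketch of the masked
  load-store pattern (illustrative only, not the kernel code itself):

    #include <immintrin.h>

    /* Copy up to four doubles under a lane mask: each 64-bit mask lane
       with its sign bit set (-1) is active, a 0 lane is skipped. */
    void masked_copy4(double *dst, const double *src,
                      const long long *mask_bits)
    {
        /* vmovdqu: load the precomputed 256-bit mask. */
        __m256i mask = _mm256_loadu_si256((const __m256i *)mask_bits);

        /* vmaskmovpd (load form): inactive lanes read as 0.0 and never
           touch memory, so running past a tile edge is safe. */
        __m256d v = _mm256_maskload_pd(src, mask);

        /* vmaskmovpd (store form): inactive lanes leave dst untouched. */
        _mm256_maskstore_pd(dst, mask, v);
    }

  E.g., mask_bits = {-1, -1, -1, 0} loads and stores only the first
  three elements of a four-double vector.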

- The following edge kernels are added for the 6x8m dgemm sup path
  (a sketch of the masking scheme follows the list).
  n-left edge kernels
  - bli_dgemmsup_rv_haswell_asm_6x7m
  - bli_dgemmsup_rv_haswell_asm_6x5m
  - bli_dgemmsup_rv_haswell_asm_6x3m

  m-left edge kernels
  - bli_dgemmsup_rv_haswell_asm_5x7
  - bli_dgemmsup_rv_haswell_asm_4x7
  - bli_dgemmsup_rv_haswell_asm_3x7
  - bli_dgemmsup_rv_haswell_asm_2x7
  - bli_dgemmsup_rv_haswell_asm_1x7

  - bli_dgemmsup_rv_haswell_asm_5x5
  - bli_dgemmsup_rv_haswell_asm_4x5
  - bli_dgemmsup_rv_haswell_asm_3x5
  - bli_dgemmsup_rv_haswell_asm_2x5
  - bli_dgemmsup_rv_haswell_asm_1x5

  - bli_dgemmsup_rv_haswell_asm_5x3
  - bli_dgemmsup_rv_haswell_asm_4x3
  - bli_dgemmsup_rv_haswell_asm_3x3
  - bli_dgemmsup_rv_haswell_asm_2x3
  - bli_dgemmsup_rv_haswell_asm_1x3
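
  As a rough sketch of the scheme (C intrinsics with illustrative names,
  not the kernels' actual code): a 7-wide row splits into one full
  4-double vector plus a 3-double masked tail, so no scalar cleanup loop
  is needed.

    #include <immintrin.h>

    /* Scale one 7-element row: columns 0..3 via a full-width vector,
       columns 4..6 via a vmaskmovpd tail with 3 of 4 lanes enabled. */
    void scale_row_7(double *c, const double *ab, double alpha)
    {
        const __m256i tail = _mm256_setr_epi64x(-1, -1, -1, 0);
        __m256d va = _mm256_set1_pd(alpha);

        __m256d lo = _mm256_loadu_pd(ab);                /* full part   */
        _mm256_storeu_pd(c, _mm256_mul_pd(lo, va));

        __m256d hi = _mm256_maskload_pd(ab + 4, tail);   /* masked tail */
        _mm256_maskstore_pd(c + 4, tail, _mm256_mul_pd(hi, va));
    }

  The 5- and 3-column cases follow analogously, with a one-lane tail
  mask and a three-lane mask on a single masked vector, respectively.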

- For the 16x3 dgemm_small kernel, the m_left computation is handled
  with masked load-store instructions to avoid the overhead of
  conditional checks for edge cases.
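
  One way such a mask can be computed from the remainder count at run
  time (a sketch under assumed conventions; the helper names here are
  hypothetical, not from the kernel):

    #include <immintrin.h>

    /* Lane i of the mask is enabled iff i < m_left (m_left in 0..4). */
    static inline __m256i tail_mask_pd(long long m_left)
    {
        __m256i iota  = _mm256_setr_epi64x(0, 1, 2, 3);
        __m256i count = _mm256_set1_epi64x(m_left);
        /* i < m_left  <=>  m_left > i; the compare yields -1 (all ones)
           in active lanes, 0 elsewhere (vpcmpgtq, AVX2). */
        return _mm256_cmpgt_epi64(count, iota);
    }

    /* Branch-free load of the m_left remainder elements of a column. */
    static inline __m256d load_m_left(const double *a, long long m_left)
    {
        return _mm256_maskload_pd(a, tail_mask_pd(m_left));
    }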

- These changes improve performance by reducing branching overhead
  and by being more cache-friendly.

AMD-Internal: [CPUPL-3574]

Change-Id: I976d6a9209d2a1a02b2830d03d21d200a5aad173
commit 5bdf5e2aaa
parent 758ec3b5ca
Author: Harsh Dave
Date:   2023-07-18 01:25:14 -05:00

8 changed files with 10556 additions and 1045 deletions

@@ -776,6 +776,7 @@
 #define VMOVHPD(...) INSTR_(vmovhpd, __VA_ARGS__)
 #define VMOVDQA(_0, _1) INSTR_(vmovdqa, _0, _1)
 #define VMOVDQA32(_0, _1) INSTR_(vmovdqa32, _0, _1)
+#define VMOVDQU(_0, _1) INSTR_(vmovdqu, _0, _1)
 #define VMOVDQA64(_0, _1) INSTR_(vmovdqa64, _0, _1)
 #define VBROADCASTSS(_0, _1) INSTR_(vbroadcastss, _0, _1)
 #define VBROADCASTSD(_0, _1) INSTR_(vbroadcastsd, _0, _1)
@@ -809,6 +810,7 @@
 #define vmovhpd(...) VMOVHPD(__VA_ARGS__)
 #define vmovdqa(_0, _1) VMOVDQA(_0, _1)
 #define vmovdqa32(_0, _1) VMOVDQA32(_0, _1)
+#define vmovdqu(_0, _1) VMOVDQU(_0, _1)
 #define vmovdqa64(_0, _1) VMOVDQA64(_0, _1)
 #define vbroadcastss(_0, _1) VBROADCASTSS(_0, _1)
 #define vbroadcastsd(_0, _1) VBROADCASTSD(_0, _1)
@@ -911,6 +913,7 @@
 #define VCOMISS(_0, _1) INSTR_(vcomiss, _0, _1)
 #define VCOMISD(_0, _1) INSTR_(vcomisd, _0, _1)
+#define VMASKMOVPD(_0, _1, _2) INSTR_(vmaskmovpd, _0, _1, _2)
 #define VFMADD132SS(_0, _1, _2) INSTR_(vfmadd132ss, _0, _1, _2)
 #define VFMADD213SS(_0, _1, _2) INSTR_(vfmadd213ss, _0, _1, _2)
 #define VFMADD231SS(_0, _1, _2) INSTR_(vfmadd231ss, _0, _1, _2)
@@ -1236,7 +1239,7 @@
 #define vblendpd(_0, _1, _2, _3) VBLENDPD(_0, _1, _2, _3)
 #define vblendmps(_0, _1, _2) VBLENDMPS(_0, _1, _2)
 #define vblendmpd(_0, _1, _2) VBLENDMPD(_0, _1, _2)
+#define vmaskmovpd(_0, _1, _2) VMASKMOVPD(_0, _1, _2)
 // Prefetches
 #define PREFETCH(_0, _1) INSTR_(prefetcht##_0, _1)