mirror of
https://github.com/amd/blis.git
synced 2026-05-11 01:30:00 +00:00
Details: - Imported the 24x16 knl sgemm microkernel (and its corresponding spackm kernel) from TBLIS and enabled its use in the knl sub-config. Also added sgemm microkernel prototype to bli_kernels_knl.h. - Updated dgemm and dpackm microkernels from TBLIS, which included an important change regarding the offsets array (changed from extern declaration to static declaration/definition). - Activated use of level-1v and -1f zen kernels in skx and knl sub-configs. - Removed some old macros no longer needed in bli_family_skx.h now that libmemkind support exists in configure. - Moved bli_avx512_macros.h to frame/include and adjusted #includes in skx and knl kernels accordingly. - Moved unused kernels in kernels/knl/3 to kernels/knl/3/other directory. - Fixed a minor bug in the 'make' output per compile when verbosity is not turned on. The rule-generating function 'make-kernel-rule' was previously passing in the name of the config, rather than the name of the kernel set returned by get-config-for-kset, which could give misleading information to the user when the kconfig_map mapped a kernel set to a sub-configuration that did not share the same name. (This didn't affect the CFLAGS that were actually used.) - Updated test/3m4m/Makefile, removing acml targets and renaming the remaining targets.
174 lines
6.3 KiB
C
174 lines
6.3 KiB
C
#ifndef BLIS_AVX512_MACROS_H
#define BLIS_AVX512_MACROS_H

//
// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful.
//
// Instruction wrappers in this header take their operands destination-first
// and emit them in reverse, i.e. in AT&T (source-first) order.
//

// Comment delimiters (presumably for annotating the emitted assembly):
// COMMENT_BEGIN is the string "#", COMMENT_END expands to nothing.
#define COMMENT_BEGIN "#"
#define COMMENT_END

// STRINGIFY turns its argument list, as written, into one string literal.
// ASM macro-expands its arguments first, stringifies the result, and appends
// "\n\t" so that consecutive ASM(...) uses concatenate into a single
// multi-line string.
// LABEL emits "<label>:" followed by the same "\n\t" separator.
#define STRINGIFY(...) #__VA_ARGS__
#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t"
#define LABEL(label) STRINGIFY(label) ":\n\t"

// Vector register names: XMM(3) -> %%xmm3, and likewise for YMM and ZMM.
// The '%' is doubled, as needed for a literal '%' inside a GCC extended-asm
// template.
#define XMM(x) %%xmm##x
#define YMM(x) %%ymm##x
#define ZMM(x) %%zmm##x

// Legacy general-purpose registers, 32-bit names.
#define EAX %%eax
#define EBX %%ebx
#define ECX %%ecx
#define EDX %%edx
#define EBP %%ebp
#define EDI %%edi
#define ESI %%esi

// Legacy general-purpose registers, 64-bit names.
#define RAX %%rax
#define RBX %%rbx
#define RCX %%rcx
#define RDX %%rdx
#define RBP %%rbp
#define RDI %%rdi
#define RSI %%rsi

// Mask registers: K(1) -> %%k1.
#define K(x) %%k##x

// Numbered 64-bit registers: R(8) -> %%r8, plus fixed names R8..R15.
#define R(x) %%r##x
#define R8 %%r8
#define R9 %%r9
#define R10 %%r10
#define R11 %%r11
#define R12 %%r12
#define R13 %%r13
#define R14 %%r14
#define R15 %%r15

// 32-bit views of the numbered registers: RD(8) -> %%r8d, plus R8D..R15D.
#define RD(x) %%r##x##d
#define R8D %%r8d
#define R9D %%r9d
#define R10D %%r10d
#define R11D %%r11d
#define R12D %%r12d
#define R13D %%r13d
#define R14D %%r14d
#define R15D %%r15d

// IMM(x) pastes a '$' prefix onto x (the AT&T spelling of an immediate).
// VAR(x) produces %[x], a reference to the named extended-asm operand x.
#define IMM(x) $##x
#define VAR(x) %[x]

// Memory operands in AT&T form. MEM(...) dispatches on its argument count:
//   MEM(reg)                 -> (reg)
//   MEM(reg,disp)            -> disp(reg)
//   MEM(reg,off,scale)       -> (reg,off,scale)
//   MEM(reg,off,scale,disp)  -> disp(reg,off,scale)
#define MEM_4(reg,off,scale,disp) disp(reg,off,scale)
#define MEM_3(reg,off,scale) (reg,off,scale)
#define MEM_2(reg,disp) disp(reg)
#define MEM_1(reg) (reg)

// The same operands followed by the {1to8} broadcast decorator. The braces
// are written %{ and %} so they survive as literal braces in an extended-asm
// template.
#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%}
#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%}
#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%}
#define MEM_1TO8_1(reg) MEM(reg) %{1to8%}

// The same operands followed by the {1to16} broadcast decorator.
#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%}
#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%}
#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%}
#define MEM_1TO16_1(reg) MEM(reg) %{1to16%}

// Argument-count dispatch: GET_MACRO selects its fifth argument, so the
// number of caller arguments placed in front of the candidate list decides
// which _4/_3/_2/_1 variant is chosen. The trailing empty argument keeps the
// variadic part non-empty even for one-argument calls, which strict
// (pre-C23) preprocessors require.
#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME
#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1,)(__VA_ARGS__)
#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1,)(__VA_ARGS__)
#define MEM_1TO16(...) GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1,)(__VA_ARGS__)

// Mask decorators: MASK_K(1) -> %{%%k1%}, MASK_KZ(1) additionally appends
// %{z%}. The %{ / %} escapes become literal braces in an extended-asm
// template.
#define MASK_K(n) %{%%k##n%}
#define MASK_KZ(n) %{%%k##n%}%{z%}

// KMOV(to,from) emits "kmovw from, to" (destination-first arguments,
// AT&T output order).
#define KMOV(to,from) ASM(kmovw from, to)

// Jump to `label` when mask register `kreg` is non-zero: kortestw ORs kreg
// with itself and sets ZF when the result is zero, so jnz branches if any
// bit is set. The two instructions must stay on directly continued lines;
// anything between a '\' and the next line breaks the macro.
#define JKNZD(kreg,label) \
    ASM(kortestw kreg, kreg) \
    ASM(jnz label)

// Three-operand mask instructions. Arguments are destination-first and are
// emitted in reverse (AT&T) order.
#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0)
#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0)

// Alignment directives: .p2align N aligns to 2^N bytes (16 and 32 here).
#define ALIGN16 ASM(.p2align 4)
#define ALIGN32 ASM(.p2align 5)

// Read the time-stamp counter. The mnemonic is "rdtsc"; the previous
// spelling "rdstc" is not an instruction and would be rejected by the
// assembler.
#define RDTSC ASM(rdtsc)

// Scalar move wrappers. _0 is the destination and _1 the source; they are
// emitted source-first, as AT&T syntax expects.
#define MOV(_0, _1) ASM(mov _1, _0)
#define MOVD(_0, _1) ASM(movd _1, _0)
#define MOVL(_0, _1) ASM(movl _1, _0)
#define MOVQ(_0, _1) ASM(movq _1, _0)
#define VMOVD(_0, _1) ASM(vmovd _1, _0)
#define VMOVQ(_0, _1) ASM(vmovq _1, _0)

// Integer compare / arithmetic / shift wrappers. Arguments are given
// destination-first and emitted in reverse (AT&T) order.
#define CMP(_0, _1) ASM(cmp _1, _0)
#define AND(_0, _1) ASM(and _1, _0)
#define ADD(_0, _1) ASM(add _1, _0)
#define SUB(_0, _1) ASM(sub _1, _0)
#define SAL(_0, _1) ASM(sal _1, _0)
#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0)
#define SAR(_0, _1) ASM(sar _1, _0)
// One-operand shift forms: no count operand is emitted.
#define SAL1(_0) ASM(sal _0)
#define SAR1(_0) ASM(sar _0)
#define LEA(_0, _1) ASM(lea _1, _0)
#define TEST(_0, _1) ASM(test _1, _0)
#define DEC(_0) ASM(dec _0)

// Conditional and unconditional jumps; the single argument is the target
// label.
#define JLE(_0) ASM(jle _0)
#define JL(_0) ASM(jl _0)
#define JNZ(_0) ASM(jnz _0)
#define JZ(_0) ASM(jz _0)
#define JNE(_0) ASM(jne _0)
#define JE(_0) ASM(je _0)
#define JNC(_0) ASM(jnc _0)
#define JC(_0) ASM(jc _0)
#define JMP(_0) ASM(jmp _0)

// Scalar floating-point compares. Arguments are emitted in reverse order.
#define VCOMISS(_0, _1) ASM(vcomiss _1, _0)
#define VCOMISD(_0, _1) ASM(vcomisd _1, _0)

// Gather / scatter wrappers (dword- and qword-indexed, single and double
// precision). Two operands each, emitted in reverse order.
#define VGATHERDPS(_0, _1) ASM(vgatherdps _1, _0)
#define VSCATTERDPS(_0, _1) ASM(vscatterdps _1, _0)
#define VGATHERDPD(_0, _1) ASM(vgatherdpd _1, _0)
#define VSCATTERDPD(_0, _1) ASM(vscatterdpd _1, _0)
#define VGATHERQPS(_0, _1) ASM(vgatherqps _1, _0)
#define VSCATTERQPS(_0, _1) ASM(vscatterqps _1, _0)
#define VGATHERQPD(_0, _1) ASM(vgatherqpd _1, _0)
#define VSCATTERQPD(_0, _1) ASM(vscatterqpd _1, _0)

// Three-operand vector arithmetic. _0 is the destination; the operands are
// emitted as "_2, _1, _0" (AT&T order).
#define VMULSS(_0, _1, _2) ASM(vmulss _2, _1, _0)
#define VMULSD(_0, _1, _2) ASM(vmulsd _2, _1, _0)
#define VMULPS(_0, _1, _2) ASM(vmulps _2, _1, _0)
#define VMULPD(_0, _1, _2) ASM(vmulpd _2, _1, _0)
#define VPMULLD(_0, _1, _2) ASM(vpmulld _2, _1, _0)
#define VPMULLQ(_0, _1, _2) ASM(vpmullq _2, _1, _0)
#define VPADDD(_0, _1, _2) ASM(vpaddd _2, _1, _0)
#define VPSLLD(_0, _1, _2) ASM(vpslld _2, _1, _0)
#define VPXORD(_0, _1, _2) ASM(vpxord _2, _1, _0)
#define VXORPD(_0, _1, _2) ASM(vxorpd _2, _1, _0)

// Fused multiply-add, all three operand orderings, single and double
// precision.
#define VFMADD132PS(_0, _1, _2) ASM(vfmadd132ps _2, _1, _0)
#define VFMADD213PS(_0, _1, _2) ASM(vfmadd213ps _2, _1, _0)
#define VFMADD231PS(_0, _1, _2) ASM(vfmadd231ps _2, _1, _0)
#define VFMADD132PD(_0, _1, _2) ASM(vfmadd132pd _2, _1, _0)
#define VFMADD213PD(_0, _1, _2) ASM(vfmadd213pd _2, _1, _0)
#define VFMADD231PD(_0, _1, _2) ASM(vfmadd231pd _2, _1, _0)

// Vector moves. _0 is the destination and _1 the source; emitted
// source-first.
#define VMOVDQA(_0, _1) ASM(vmovdqa _1, _0)
#define VMOVDQA32(_0, _1) ASM(vmovdqa32 _1, _0)
#define VMOVDQA64(_0, _1) ASM(vmovdqa64 _1, _0)
#define VMOVSS(_0, _1) ASM(vmovss _1, _0)
#define VMOVSD(_0, _1) ASM(vmovsd _1, _0)
#define VMOVAPS(_0, _1) ASM(vmovaps _1, _0)
#define VMOVUPS(_0, _1) ASM(vmovups _1, _0)
#define VMOVAPD(_0, _1) ASM(vmovapd _1, _0)
#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0)

// Broadcasts, same argument convention.
#define VBROADCASTSS(_0, _1) ASM(vbroadcastss _1, _0)
#define VBROADCASTSD(_0, _1) ASM(vbroadcastsd _1, _0)
#define VPBROADCASTD(_0, _1) ASM(vpbroadcastd _1, _0)
#define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0)
#define VBROADCASTF64X4(_0, _1) ASM(vbroadcastf64x4 _1, _0)

// Lane insert / extract / unpack / shuffle wrappers. _0 is the destination;
// the remaining arguments are emitted in reverse, so the last macro argument
// becomes the first assembler operand.
#define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0)
#define VEXTRACTF64X4(_0, _1, _2) ASM(vextractf64x4 _2, _1, _0)
// NOTE(review): the vinsertf128 instruction takes four operands (imm8,
// xmm/m128, ymm, ymm), but this wrapper only forwards three; confirm
// against callers before changing the arity.
#define VINSERTF128(_0, _1, _2) ASM(vinsertf128 _2, _1, _0)
#define VEXTRACTF128(_0, _1, _2) ASM(vextractf128 _2, _1, _0)
#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0)
#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
#define VUNPCKLPS(_0, _1, _2) ASM(vunpcklps _2, _1, _0)
#define VUNPCKHPS(_0, _1, _2) ASM(vunpckhps _2, _1, _0)
#define VSHUFPS(_0, _1, _2, _3) ASM(vshufps _3, _2, _1, _0)
#define VPERM2F128(_0, _1, _2, _3) ASM(vperm2f128 _3, _2, _1, _0)

// Prefetch wrappers. LEVEL is pasted directly into the mnemonic, e.g.
// PREFETCH(0, addr) -> "prefetcht0 addr".
#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS)
#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS)

// Gather / scatter prefetches; LEVEL is pasted into the middle of the
// mnemonic, e.g. VGATHERPFDPS(0, addr) -> "vgatherpf0dps addr".
#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS)
#define VSCATTERPFDPD(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dpd ADDRESS)

// Function-like with no parameters: must be invoked as VZEROUPPER().
#define VZEROUPPER() ASM(vzeroupper)

#endif // BLIS_AVX512_MACROS_H