From a7166feb1053814b7dd27f3879ae38acfc9637fc Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 25 Jun 2018 12:09:18 -0500 Subject: [PATCH] Finish macroization of assembly ukernels. --- frame/include/bli_x86_asm_macros.h | 6 +- .../3/bli_gemm_bulldozer_asm_d4x6_fma4.c | 190 +++++++++-------- kernels/knl/1m/bli_dpackm_knl_asm_24x8.c | 9 +- kernels/knl/1m/bli_spackm_knl_asm_24x16.c | 14 +- kernels/knl/3/bli_dgemm_knl_asm_24x8.c | 2 +- kernels/knl/3/bli_sgemm_knl_asm_24x16.c | 8 +- kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c | 97 ++++----- .../penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c | 54 ++--- .../penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c | 51 +++-- kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c | 26 +-- kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c | 26 +-- .../3/bli_gemm_piledriver_asm_d8x3.c | 200 +++++++++--------- .../3/bli_gemm_sandybridge_asm_d8x4.c | 198 ++++++++--------- kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c | 2 +- kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c | 2 +- kernels/zen/3/bli_gemm_zen_asm_d6x8.c | 184 ++++++++-------- kernels/zen/3/bli_gemm_zen_asm_d8x6.c | 184 ++++++++-------- kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c | 96 ++++----- kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c | 98 +++++---- 19 files changed, 726 insertions(+), 721 deletions(-) diff --git a/frame/include/bli_x86_asm_macros.h b/frame/include/bli_x86_asm_macros.h index 5379b87bd..792dc46c1 100644 --- a/frame/include/bli_x86_asm_macros.h +++ b/frame/include/bli_x86_asm_macros.h @@ -98,7 +98,7 @@ #define COMMENT_BEGIN "#" #define COMMENT_END -#define BEGIN_ASM __asm__ volatile ( +#define BEGIN_ASM() __asm__ volatile ( #define END_ASM(...) __VA_ARGS__ ); @@ -149,8 +149,8 @@ #endif -#define begin_asm BEGIN_ASM -#define end_asm END_ASM +#define begin_asm() BEGIN_ASM() +#define end_asm(...) END_ASM(__VA_ARGS__) #define label(...) LABEL(__VA_ARGS__) #define imm(...) 
IMM(__VA_ARGS__) diff --git a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c index 059d54e64..8b8771226 100644 --- a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c +++ b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c @@ -107,18 +107,17 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b. vpermilps(imm(0x4e), ymm2, ymm3) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c; @@ -142,7 +141,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 vxorps(ymm15, ymm15, ymm15) - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -251,7 +250,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 label(.SCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -348,8 +347,8 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 // ab61 ab63 ab65 ab67 // ab71 ) ab73 ) ab75 ) ab77 ) - mov(%4, rax) // load address of alpha - mov(%5, rbx) // load address of beta + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm4) // load beta and duplicate @@ -365,7 +364,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; @@ -740,19 +739,20 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 label(.SDONE) + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -761,7 +761,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } #undef KERNEL4x6_1 @@ -879,13 +879,12 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ - ( + begin_asm() vzeroall() - mov(%3, rbx) // load address of b. - mov(%2, rax) // load address of a. + mov(var(b), rbx) // load address of b. + mov(var(a), rax) // load address of a. 
prefetch(0, mem(rax, 64)) @@ -895,7 +894,7 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 add(imm(12*8), rbx) add(imm(8*8), rax) - mov(%0, rsi) // i = k_iter; notice %0 not $0 + mov(var(k_iter), rsi) // i = k_iter; notice var(k_iter) not $0 test(rsi, rsi) je(.CONSIDERKLEFT) @@ -920,7 +919,7 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 label(.CONSIDERKLEFT) - mov(%1, rsi) + mov(var(k_left), rsi) test(rsi, rsi) label(.LOOPKLEFT) je(.POSTACCUM) @@ -935,11 +934,11 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 label(.POSTACCUM) - mov(%7, rsi) // load cs_c - mov(%8, rdi) // load rs_c - vmovddup(mem(%4), xmm2) //load alpha - vmovddup(mem(%5), xmm3) //load beta - mov(%6, rcx) // load address of c + mov(var(rs_c), rsi) // load cs_c + mov(var(cs_c), rdi) // load rs_c + vmovddup(mem(var(alpha)), xmm2) //load alpha + vmovddup(mem(var(beta)), xmm3) //load beta + mov(var(c), rcx) // load address of c sal(imm(3), rsi) // cs_c *= sizeof(double) sal(imm(3), rdi) // rs_c *= sizeof(double) lea(mem(rcx, rdi, 2), rdx) @@ -1034,17 +1033,20 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 vmovhpd(xmm14, mem(rdx, rdi, 1)) vmovhpd(xmm15, mem(r8, rdi, 1)) + end_asm( : // output operands (none) : // input operands - "r" (k_iter), - "r" (k_left), - "r" (a), - "r" (b), - "r" (alpha), - "r" (beta), - "r" (c), - "m" (rs_c), - "m" (cs_c) + [k_iter] "r" (k_iter), // 0 + [k_left] "r" (k_left), // 1 + [a] "r" (a), // 2 + [b] "r" (b), // 3 + [alpha] "r" (alpha), // 4 + [beta] "r" (beta), // 5 + [c] "r" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "xmm0", "xmm1", "xmm2", "xmm3", @@ -1052,7 +1054,7 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } //The parameter "i" is the iteration number, i.e. 
the B values to read #define MADD_TO_YMM(i) \ @@ -1094,21 +1096,20 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. - mov(%9, r15) // load address of b_next. - //mov(%10, r14) // load address of a_next. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. + mov(var(b_next), r15) // load address of b_next. + //mov(var(a_next), r14) // load address of a_next. sub(imm(4*64), r15) vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovsldup(mem(rbx, 0*32), ymm2) vpermilps(imm(0x4e), ymm2, ymm3) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; @@ -1126,7 +1127,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -1266,7 +1267,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 label(.CCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -1377,7 +1378,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 // scale by alpha - mov(%4, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate @@ -1424,7 +1425,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 - mov(%5, rbx) // load address of beta + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate @@ -1434,7 +1435,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex) lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; @@ -1835,19 +1836,20 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 label(.CDONE) + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c), // 8 - "m" (b_next)/*, // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next)/*, // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -1856,7 +1858,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" - ); + ) } #define MADDSUBPD_TO_YMM \ @@ -1905,21 +1907,20 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. - //mov(%9, r15) // load address of b_next. 
- //mov(%10, r14) // load address of a_next. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. + //mov(var(b_next), r15) // load address of b_next. + //mov(var(a_next), r14) // load address of a_next. vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovddup(mem(rbx, 0+0*32), ymm2) vmovddup(mem(rbx, 0+1*32), ymm3) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; @@ -1939,7 +1940,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 vxorpd(ymm15, ymm15, ymm15) - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -2083,7 +2084,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 label(.ZCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -2176,7 +2177,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 // scale by alpha - mov(%4, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate @@ -2190,13 +2191,13 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 Z_ALPHA(9, 1) Z_ALPHA(8, 0) - mov(%5, rbx) // load address of beta + mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) lea(mem(, rsi, 2), rsi) lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; @@ -2508,19 +2509,20 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 label(.ZDONE) + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -2529,6 +2531,6 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" - ); + ) } diff --git a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c index 29ee9cdc9..0d8996671 100644 --- a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c +++ b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c @@ -127,7 +127,7 @@ void bli_dpackm_knl_asm_8xk const int64_t lda = lda_; const 
int64_t ldp = ldp_; - BEGIN_ASM + BEGIN_ASM() MOV(RSI, VAR(n)) MOV(RAX, VAR(a)) @@ -321,8 +321,8 @@ void bli_dpackm_knl_asm_24xk const int64_t lda = lda_; const int64_t ldp = ldp_; - __asm__ volatile - ( + BEGIN_ASM() + MOV(RSI, VAR(n)) MOV(RAX, VAR(a)) MOV(RBX, VAR(inca)) @@ -524,6 +524,7 @@ void bli_dpackm_knl_asm_24xk LABEL(PACK24_DONE) + END_ASM( : //output operands : //input operands [n] "m" (n), @@ -543,5 +544,5 @@ void bli_dpackm_knl_asm_24xk "zmm30", "zmm31", "rax", "rbx", "rcx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory" - ); + ) } diff --git a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c index 961df18ef..f9c69d008 100644 --- a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c +++ b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c @@ -129,8 +129,8 @@ void bli_spackm_knl_asm_16xk const int64_t lda = lda_; const int64_t ldp = ldp_; - __asm__ volatile - ( + BEGIN_ASM() + MOV(RSI, VAR(n)) MOV(RAX, VAR(a)) MOV(RBX, VAR(inca)) @@ -295,6 +295,7 @@ void bli_spackm_knl_asm_16xk LABEL(PACK16_DONE) + END_ASM( : //output operands : //input operands [n] "m" (n), @@ -314,7 +315,7 @@ void bli_spackm_knl_asm_16xk "zmm30", "zmm31", "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" - ); + ) } void bli_spackm_knl_asm_24xk @@ -338,8 +339,8 @@ void bli_spackm_knl_asm_24xk const int64_t lda = lda_; const int64_t ldp = ldp_; - __asm__ volatile - ( + BEGIN_ASM() + MOV(RSI, VAR(n)) MOV(RAX, VAR(a)) MOV(RBX, VAR(inca)) @@ -540,6 +541,7 @@ void bli_spackm_knl_asm_24xk LABEL(PACK24_DONE) + END_ASM( : //output operands : //input operands [n] "m" (n), @@ -559,5 +561,5 @@ void bli_spackm_knl_asm_24xk "zmm30", "zmm31", "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" - ); + ) } diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c index 196372c1a..a135b4f27 100644 --- 
a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c +++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c @@ -213,7 +213,7 @@ void bli_dgemm_knl_asm_24x8 int tlooph, tloopl, blooph, bloopl; #endif - BEGIN_ASM + BEGIN_ASM() #ifdef MONITORS RDTSC diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c index 41e4b12aa..620b436ee 100644 --- a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c +++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c @@ -210,8 +210,8 @@ void bli_sgemm_knl_asm_24x16 int tlooph, tloopl, blooph, bloopl; #endif - __asm__ volatile - ( + BEGIN_ASM() + #ifdef MONITORS RDTSC MOV(VAR(topl), EAX) @@ -664,6 +664,8 @@ void bli_sgemm_knl_asm_24x16 MOV(VAR(botl), EAX) MOV(VAR(both), EDX) #endif + + END_ASM( : // output operands #ifdef MONITORS [topl] "=m" (topl), @@ -694,7 +696,7 @@ void bli_sgemm_knl_asm_24x16 "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" - ); + ) #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); diff --git a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c index 98486a2e8..84a998019 100644 --- a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c @@ -59,13 +59,12 @@ void bli_sgemm_penryn_asm_8x4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. - mov(%9, r9) // load address of b_next. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. + mov(var(b_next), r9) // load address of b_next. sub(imm(0-8*16), rax) // increment pointers to allow byte sub(imm(0-8*16), rbx) // offsets in the unrolled iterations. @@ -74,8 +73,8 @@ void bli_sgemm_penryn_asm_8x4 movaps(mem(rax, -7*16), xmm1) // of a and b. 
movaps(mem(rbx, -8*16), xmm2) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) mov(rdi, r12) // make a copy of cs_c (in bytes) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; @@ -102,7 +101,7 @@ void bli_sgemm_penryn_asm_8x4 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -252,7 +251,7 @@ void bli_sgemm_penryn_asm_8x4 label(.SCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -307,15 +306,15 @@ void bli_sgemm_penryn_asm_8x4 addps(xmm5, xmm15) - mov(%4, rax) // load address of alpha - mov(%5, rbx) // load address of beta + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta movss(mem(rax), xmm6) // load alpha to bottom 4 bytes of xmm6 movss(mem(rbx), xmm7) // load beta to bottom 4 bytes of xmm7 pshufd(imm(0x00), xmm6, xmm6) // populate xmm6 with four alphas pshufd(imm(0x00), xmm7, xmm7) // populate xmm7 with four betas - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c mov(rsi, r8) // make a copy of rs_c lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) @@ -821,18 +820,20 @@ void bli_sgemm_penryn_asm_8x4 label(.SDONE) + end_asm( : // output operands (none) : // input operands - "m" (k_iter), - "m" (k_left), - "m" (a), - "m" (b), - "m" (alpha), - "m" (beta), - "m" (c), - "m" (rs_c), - "m" (cs_c), - "m" (b_next) + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" 
(b_next)/*, // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "xmm0", "xmm1", "xmm2", "xmm3", @@ -840,7 +841,7 @@ void bli_sgemm_penryn_asm_8x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } void bli_dgemm_penryn_asm_4x4 @@ -865,14 +866,13 @@ void bli_dgemm_penryn_asm_4x4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. - mov(%9, r9) // load address of b_next. - mov(%10, r11) // load address of a_next. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. + mov(var(b_next), r9) // load address of b_next. + mov(var(a_next), r11) // load address of a_next. sub(imm(0-8*16), rax) // increment pointers to allow byte sub(imm(0-8*16), rbx) // offsets in the unrolled iterations. @@ -881,8 +881,8 @@ void bli_dgemm_penryn_asm_4x4 movaps(mem(rax, -7*16), xmm1) // of a and b. movaps(mem(rbx, -8*16), xmm2) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) mov(rdi, r12) // make a copy of cs_c (in bytes) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; @@ -909,7 +909,7 @@ void bli_dgemm_penryn_asm_4x4 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -1076,7 +1076,7 @@ void bli_dgemm_penryn_asm_4x4 label(.DCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -1132,13 +1132,13 @@ void bli_dgemm_penryn_asm_4x4 addpd(xmm6, xmm14) - mov(%4, rax) // load address of alpha - mov(%5, rbx) // load address of beta + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta movddup(mem(rax), xmm6) // load alpha and duplicate movddup(mem(rbx), xmm7) // load beta and duplicate - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c mov(rsi, r8) // make a copy of rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) @@ -1467,19 +1467,20 @@ void bli_dgemm_penryn_asm_4x4 label(.DDONE) + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c), // 8 - "m" (b_next), // 9 - "m" (a_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "xmm0", "xmm1", "xmm2", "xmm3", @@ -1487,7 +1488,7 @@ void bli_dgemm_penryn_asm_4x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } diff --git a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c index 992d17967..ac8659396 100644 --- a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c @@ -76,12 +76,11 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a10. - mov(%4, rbx) // load address of b01. - //mov(%10, r9) // load address of b_next. + mov(var(a10), rax) // load address of a10. 
+ mov(var(b01), rbx) // load address of b01. + //mov(var(b_next), r9) // load address of b_next. sub(imm(0-8*16), rax) // increment pointers to allow byte sub(imm(0-8*16), rbx) // offsets in the unrolled iterations. @@ -90,8 +89,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -7*16), xmm1) // of a and b. movaps(mem(rbx, -8*16), xmm2) - //mov(%6, rcx) // load address of c11 - //mov(%9, rdi) // load cs_c + //mov(var(c11), rcx) // load address of c11 + //mov(var(rs_c), rdi) // load cs_c //lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*cs_c; @@ -117,7 +116,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CONSIDERKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -270,7 +269,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 label(.CONSIDERKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.POSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -327,7 +326,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 - mov(%5, rbx) // load address of b11. + mov(var(b11), rbx) // load address of b11. 
// xmm8: xmm9: xmm10: xmm11: // ( ab01 ( ab00 ( ab03 ( ab02 @@ -361,7 +360,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) - mov(%9, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha movddup(mem(rax), xmm15) // load alpha and duplicate movaps(mem(rbx, 0*16), xmm8) @@ -400,11 +399,11 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 label(.TRSM) - mov(%3, rax) // load address of a11 - mov(%6, rcx) // load address of c11 + mov(var(a11), rax) // load address of a11 + mov(var(c11), rcx) // load address of c11 - mov(%7, rsi) // load rs_c - mov(%8, rdi) // load cs_c + mov(var(rs_c), rsi) // load rs_c + mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) @@ -519,19 +518,20 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a10), // 2 - "m" (a11), // 3 - "m" (b01), // 4 - "m" (b11), // 5 - "m" (c11), // 6 - "m" (rs_c), // 7 - "m" (cs_c), // 8 - "m" (alpha), // 9 - "m" (b_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a10] "m" (a10), // 2 + [a11] "m" (a11), // 3 + [b01] "m" (b01), // 4 + [b11] "m" (b11), // 5 + [c11] "m" (c11), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [alpha] "m" (alpha), // 9 + [b_next] "m" (b_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", @@ -539,7 +539,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } diff --git a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c index c0b94269a..3d6e8a0d4 100644 --- a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c @@ -76,12 +76,11 @@ void 
bli_dgemmtrsm_u_penryn_asm_4x4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a12. - mov(%4, rbx) // load address of b21. - //mov(%10, r9) // load address of b_next. + mov(var(a12), rax) // load address of a12. + mov(var(b21), rbx) // load address of b21. + //mov(var(b_next), r9) // load address of b_next. add(imm(8*16), rax) // increment pointers to allow byte add(imm(8*16), rbx) // offsets in the unrolled iterations. @@ -106,7 +105,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CONSIDERKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -253,7 +252,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 label(.CONSIDERKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.POSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -310,7 +309,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 - mov(%5, rbx) // load address of b11. + mov(var(b11), rbx) // load address of b11. 
// xmm8: xmm9: xmm10: xmm11: // ( ab01 ( ab00 ( ab03 ( ab02 @@ -344,7 +343,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) - mov(%9, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha movddup(mem(rax), xmm15) // load alpha and duplicate movaps(mem(rbx, 0*16), xmm8) @@ -383,11 +382,11 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 label(.TRSM) - mov(%3, rax) // load address of a11 - mov(%6, rcx) // load address of c11 + mov(var(a11), rax) // load address of a11 + mov(var(c11), rcx) // load address of c11 - mov(%7, rsi) // load rs_c - mov(%8, rdi) // load cs_c + mov(var(rs_c), rsi) // load rs_c + mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) @@ -504,20 +503,20 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 - + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a12), // 2 - "m" (a11), // 3 - "m" (b21), // 4 - "m" (b11), // 5 - "m" (c11), // 6 - "m" (rs_c), // 7 - "m" (cs_c), // 8 - "m" (alpha), // 9 - "m" (b_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a12] "m" (a12), // 2 + [a11] "m" (a11), // 3 + [b21] "m" (b21), // 4 + [b11] "m" (b11), // 5 + [c11] "m" (c11), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [alpha] "m" (alpha), // 9 + [b_next] "m" (b_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "xmm0", "xmm1", "xmm2", "xmm3", @@ -525,7 +524,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } diff --git a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c index ab8c846bb..39a06b538 100644 --- a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c @@ -64,10 +64,9 @@ void bli_dtrsm_l_penryn_asm_4x4 uint64_t rs_c 
= rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%1, rbx) // load address of b11. + mov(var(b11), rbx) // load address of b11. movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 ) movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 ) @@ -80,11 +79,11 @@ void bli_dtrsm_l_penryn_asm_4x4 - mov(%0, rax) // load address of a11 - mov(%2, rcx) // load address of c11 + mov(var(a11), rax) // load address of a11 + mov(var(c11), rcx) // load address of c11 - mov(%3, rsi) // load rs_c - mov(%4, rdi) // load cs_c + mov(var(rs_c), rsi) // load rs_c + mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) @@ -199,13 +198,14 @@ void bli_dtrsm_l_penryn_asm_4x4 + end_asm( : // output operands (none) : // input operands - "m" (a11), // 0 - "m" (b11), // 1 - "m" (c11), // 2 - "m" (rs_c), // 3 - "m" (cs_c) // 4 + [a11] "m" (a11), // 0 + [b11] "m" (b11), // 1 + [c11] "m" (c11), // 2 + [rs_c] "m" (rs_c), // 3 + [cs_c] "m" (cs_c) // 4 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", @@ -213,7 +213,7 @@ void bli_dtrsm_l_penryn_asm_4x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } diff --git a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c index de78c59db..290aa27e0 100644 --- a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c @@ -64,10 +64,9 @@ void bli_dtrsm_u_penryn_asm_4x4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%1, rbx) // load address of b11. + mov(var(b11), rbx) // load address of b11. 
movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 ) movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 ) @@ -80,11 +79,11 @@ void bli_dtrsm_u_penryn_asm_4x4 - mov(%0, rax) // load address of a11 - mov(%2, rcx) // load address of c11 + mov(var(a11), rax) // load address of a11 + mov(var(c11), rcx) // load address of c11 - mov(%3, rsi) // load rs_c - mov(%4, rdi) // load cs_c + mov(var(rs_c), rsi) // load rs_c + mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) @@ -202,13 +201,14 @@ void bli_dtrsm_u_penryn_asm_4x4 + end_asm( : // output operands (none) : // input operands - "m" (a11), // 0 - "m" (b11), // 1 - "m" (c11), // 2 - "m" (rs_c), // 3 - "m" (cs_c) // 4 + [a11] "m" (a11), // 0 + [b11] "m" (b11), // 1 + [c11] "m" (c11), // 2 + [rs_c] "m" (rs_c), // 3 + [cs_c] "m" (cs_c) // 4 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "xmm0", "xmm1", "xmm2", "xmm3", @@ -216,7 +216,7 @@ void bli_dtrsm_u_penryn_asm_4x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } diff --git a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c index 82866f1fd..4174c1527 100644 --- a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c +++ b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c @@ -62,14 +62,13 @@ void bli_sgemm_piledriver_asm_16x3 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. - mov(%9, r15) // load address of b_next. - mov(%10, r14) // load address of a_next. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. + mov(var(b_next), r15) // load address of b_next. + mov(var(a_next), r14) // load address of a_next. 
prefetch(0, mem(rbx, 128)) // prefetch b prefetch(0, mem(rbx, 64+128)) // prefetch b @@ -78,8 +77,8 @@ void bli_sgemm_piledriver_asm_16x3 add(imm(32*4), rax) add(imm(12*4), rbx) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c; @@ -103,7 +102,7 @@ void bli_sgemm_piledriver_asm_16x3 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -313,7 +312,7 @@ void bli_sgemm_piledriver_asm_16x3 label(.SCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -393,8 +392,8 @@ void bli_sgemm_piledriver_asm_16x3 - mov(%4, rax) // load address of alpha - mov(%5, rbx) // load address of beta + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm2) // load beta and duplicate @@ -419,7 +418,7 @@ void bli_sgemm_piledriver_asm_16x3 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; @@ -887,19 +886,20 @@ void bli_sgemm_piledriver_asm_16x3 label(.SDONE) + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c), // 8 - "m" (b_next), // 9 - "m" (a_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -908,7 +908,7 @@ void bli_sgemm_piledriver_asm_16x3 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } void bli_dgemm_piledriver_asm_8x3 @@ -933,14 +933,13 @@ void bli_dgemm_piledriver_asm_8x3 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. - mov(%9, r15) // load address of b_next. - mov(%10, r14) // load address of a_next. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. + mov(var(b_next), r15) // load address of b_next. + mov(var(a_next), r14) // load address of a_next. 
prefetch(0, mem(rbx, 128)) // prefetch b prefetch(0, mem(rbx, 64+128)) // prefetch b @@ -949,8 +948,8 @@ void bli_dgemm_piledriver_asm_8x3 add(imm(16*8), rax) add(imm(12*8), rbx) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c; @@ -974,7 +973,7 @@ void bli_dgemm_piledriver_asm_8x3 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -1183,7 +1182,7 @@ void bli_dgemm_piledriver_asm_8x3 label(.DCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done. // else, we prepare to @@ -1253,8 +1252,8 @@ void bli_dgemm_piledriver_asm_8x3 - mov(%4, rax) // load address of alpha - mov(%5, rbx) // load address of beta + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta vmovddup(mem(rax), xmm0) // load alpha and duplicate vmovddup(mem(rbx), xmm2) // load beta and duplicate @@ -1278,7 +1277,7 @@ void bli_dgemm_piledriver_asm_8x3 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; @@ -1606,19 +1605,20 @@ void bli_dgemm_piledriver_asm_8x3 label(.DDONE) + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c), // 8 - "m" (b_next), // 9 - "m" (a_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] 
"m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -1627,7 +1627,7 @@ void bli_dgemm_piledriver_asm_8x3 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } void bli_cgemm_piledriver_asm_4x2 @@ -1652,17 +1652,16 @@ void bli_cgemm_piledriver_asm_4x2 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. - mov(%9, r15) // load address of b_next. - mov(%10, r14) // load address of a_next. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. + mov(var(b_next), r15) // load address of b_next. + mov(var(a_next), r14) // load address of a_next. - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; @@ -1682,7 +1681,7 @@ void bli_cgemm_piledriver_asm_4x2 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -1848,7 +1847,7 @@ void bli_cgemm_piledriver_asm_4x2 label(.CCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -1925,7 +1924,7 @@ void bli_cgemm_piledriver_asm_4x2 // scale by alpha - mov(%4, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), xmm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), xmm1) // load alpha_i and duplicate @@ -1952,7 +1951,7 @@ void bli_cgemm_piledriver_asm_4x2 - mov(%5, rbx) // load address of beta + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), xmm6) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), xmm7) // load beta_i and duplicate @@ -1962,7 +1961,7 @@ void bli_cgemm_piledriver_asm_4x2 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex) @@ -2154,19 +2153,20 @@ void bli_cgemm_piledriver_asm_4x2 label(.CDONE) + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c), // 8 - "m" (b_next), // 9 - "m" (a_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -2175,7 +2175,7 @@ void bli_cgemm_piledriver_asm_4x2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } void bli_zgemm_piledriver_asm_2x2 @@ -2200,17 +2200,16 @@ void bli_zgemm_piledriver_asm_2x2 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. - mov(%9, r15) // load address of b_next. - mov(%10, r14) // load address of a_next. + mov(var(a), rax) // load address of a. 
+ mov(var(b), rbx) // load address of b. + mov(var(b_next), r15) // load address of b_next. + mov(var(a_next), r14) // load address of a_next. - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; @@ -2229,7 +2228,7 @@ void bli_zgemm_piledriver_asm_2x2 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -2399,7 +2398,7 @@ void bli_zgemm_piledriver_asm_2x2 label(.ZCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -2473,7 +2472,7 @@ void bli_zgemm_piledriver_asm_2x2 // scale by alpha - mov(%4, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha vmovddup(mem(rax), xmm0) // load alpha_r and duplicate vmovddup(mem(rax, 8), xmm1) // load alpha_i and duplicate @@ -2500,7 +2499,7 @@ void bli_zgemm_piledriver_asm_2x2 - mov(%5, rbx) // load address of beta + mov(var(beta), rbx) // load address of beta vmovddup(mem(rbx), xmm6) // load beta_r and duplicate vmovddup(mem(rbx, 8), xmm7) // load beta_i and duplicate @@ -2510,7 +2509,7 @@ void bli_zgemm_piledriver_asm_2x2 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) lea(mem(, rsi, 2), rsi) //lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; @@ -2688,19 +2687,20 @@ void bli_zgemm_piledriver_asm_2x2 label(.ZDONE) + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - 
"m" (rs_c), // 7 - "m" (cs_c), // 8 - "m" (b_next), // 9 - "m" (a_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -2709,7 +2709,7 @@ void bli_zgemm_piledriver_asm_2x2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c index 20946c3c5..1532664d7 100644 --- a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c +++ b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c @@ -62,20 +62,19 @@ void bli_sgemm_sandybridge_asm_8x8 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. - //mov(%9, r15) // load address of b_next. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. + //mov(var(b_next), r15) // load address of b_next. vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b. vpermilps(imm(0x4e), ymm2, ymm3) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c; @@ -100,7 +99,7 @@ void bli_sgemm_sandybridge_asm_8x8 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
@@ -248,7 +247,7 @@ void bli_sgemm_sandybridge_asm_8x8 label(.SCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -393,8 +392,8 @@ void bli_sgemm_sandybridge_asm_8x8 - mov(%4, rax) // load address of alpha - mov(%5, rbx) // load address of beta + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm4) // load beta and duplicate @@ -412,7 +411,7 @@ void bli_sgemm_sandybridge_asm_8x8 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; @@ -1002,19 +1001,20 @@ void bli_sgemm_sandybridge_asm_8x8 vzeroupper() + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -1023,7 +1023,7 @@ void bli_sgemm_sandybridge_asm_8x8 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } void bli_dgemm_sandybridge_asm_8x4 @@ -1048,22 +1048,21 @@ void bli_dgemm_sandybridge_asm_8x4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. 
- mov(%3, rbx) // load address of b. - mov(%9, r15) // load address of b_next. - //mov(%10, r14) // load address of a_next. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. + mov(var(b_next), r15) // load address of b_next. + //mov(var(a_next), r14) // load address of a_next. sub(imm(4*64), r15) vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovapd(mem(rbx, 0*32), ymm2) // elements of a and b. vpermilpd(imm(0x5), ymm2, ymm3) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; @@ -1083,7 +1082,7 @@ void bli_dgemm_sandybridge_asm_8x4 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -1228,7 +1227,7 @@ void bli_dgemm_sandybridge_asm_8x4 label(.DCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -1343,8 +1342,8 @@ void bli_dgemm_sandybridge_asm_8x4 // ab70 ) ab71 ) ab72 ) ab73 ) - mov(%4, rax) // load address of alpha - mov(%5, rbx) // load address of beta + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm2) // load beta and duplicate @@ -1362,7 +1361,7 @@ void bli_dgemm_sandybridge_asm_8x4 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; @@ -1677,19 +1676,20 @@ void bli_dgemm_sandybridge_asm_8x4 vzeroupper() + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c), // 8 - "m" (b_next)/*, // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next)/*, // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -1698,7 +1698,7 @@ void bli_dgemm_sandybridge_asm_8x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } void bli_cgemm_sandybridge_asm_8x4 @@ -1723,22 +1723,21 @@ void bli_cgemm_sandybridge_asm_8x4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. - mov(%9, r15) // load address of b_next. - //mov(%10, r14) // load address of a_next. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. + mov(var(b_next), r15) // load address of b_next. 
+ //mov(var(a_next), r14) // load address of a_next. sub(imm(4*64), r15) vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovsldup(mem(rbx, 0*32), ymm2) vpermilps(imm(0x4e), ymm2, ymm3) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; @@ -1758,7 +1757,7 @@ void bli_cgemm_sandybridge_asm_8x4 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -2004,7 +2003,7 @@ void bli_cgemm_sandybridge_asm_8x4 label(.CCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -2175,7 +2174,7 @@ void bli_cgemm_sandybridge_asm_8x4 // scale by alpha - mov(%4, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate @@ -2222,7 +2221,7 @@ void bli_cgemm_sandybridge_asm_8x4 - mov(%5, rbx) // load address of beta + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate @@ -2232,7 +2231,7 @@ void bli_cgemm_sandybridge_asm_8x4 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex) lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; @@ -2638,19 +2637,20 @@ void bli_cgemm_sandybridge_asm_8x4 vzeroupper() + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" 
(beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c), // 8 - "m" (b_next)/*, // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next)/*, // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -2659,7 +2659,7 @@ void bli_cgemm_sandybridge_asm_8x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } @@ -2686,21 +2686,20 @@ void bli_zgemm_sandybridge_asm_4x4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. - //mov(%9, r15) // load address of b_next. - //mov(%10, r14) // load address of a_next. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. + //mov(var(b_next), r15) // load address of b_next. + //mov(var(a_next), r14) // load address of a_next. vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovddup(mem(rbx, 0+0*32), ymm2) vmovddup(mem(rbx, 0+1*32), ymm3) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; @@ -2721,7 +2720,7 @@ void bli_zgemm_sandybridge_asm_4x4 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
@@ -2964,7 +2963,7 @@ void bli_zgemm_sandybridge_asm_4x4 label(.ZCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -3083,7 +3082,7 @@ void bli_zgemm_sandybridge_asm_4x4 // scale by alpha - mov(%4, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate @@ -3130,7 +3129,7 @@ void bli_zgemm_sandybridge_asm_4x4 - mov(%5, rbx) // load address of beta + mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate @@ -3140,7 +3139,7 @@ void bli_zgemm_sandybridge_asm_4x4 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) lea(mem(, rsi, 2), rsi) lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; @@ -3488,19 +3487,20 @@ void bli_zgemm_sandybridge_asm_4x4 vzeroupper() + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -3509,7 +3509,7 @@ void bli_zgemm_sandybridge_asm_4x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } diff 
--git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c index 54bdb9e1d..d841dad80 100644 --- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c +++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c @@ -306,7 +306,7 @@ void bli_dgemm_skx_asm_16x12_l2( const int64_t rs_c = rs_c_; const int64_t cs_c = cs_c_; - BEGIN_ASM + BEGIN_ASM() VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers VMOVAPD(YMM( 7), YMM(8)) diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c index a879ada00..39ed53b27 100644 --- a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c +++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c @@ -336,7 +336,7 @@ void bli_sgemm_skx_asm_32x12_l2( const int64_t rs_c = rs_c_; const int64_t cs_c = cs_c_; - BEGIN_ASM + BEGIN_ASM() VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers VMOVAPD(YMM( 7), YMM(8)) diff --git a/kernels/zen/3/bli_gemm_zen_asm_d6x8.c b/kernels/zen/3/bli_gemm_zen_asm_d6x8.c index c06ff6f90..f21620263 100644 --- a/kernels/zen/3/bli_gemm_zen_asm_d6x8.c +++ b/kernels/zen/3/bli_gemm_zen_asm_d6x8.c @@ -99,14 +99,13 @@ void bli_sgemm_zen_asm_6x16 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. add(imm(32*4), rbx) @@ -114,8 +113,8 @@ void bli_sgemm_zen_asm_6x16 vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - mov(%6, rcx) // load address of c - mov(%7, rdi) // load rs_c + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; @@ -130,7 +129,7 @@ void bli_sgemm_zen_asm_6x16 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -257,7 +256,7 @@ void bli_sgemm_zen_asm_6x16 label(.SCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -305,8 +304,8 @@ void bli_sgemm_zen_asm_6x16 - mov(%4, rax) // load address of alpha - mov(%5, rbx) // load address of beta + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate @@ -328,7 +327,7 @@ void bli_sgemm_zen_asm_6x16 - mov(%8, rsi) // load cs_c + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rsi, 8), rdx) // load address of c + 8*cs_c; @@ -872,19 +871,20 @@ void bli_sgemm_zen_asm_6x16 + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -893,7 +893,7 @@ void bli_sgemm_zen_asm_6x16 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } @@ -945,14 +945,13 @@ void bli_dgemm_zen_asm_6x8 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. 
- mov(%3, rbx) // load address of b. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. add(imm(32*4), rbx) @@ -960,8 +959,8 @@ void bli_dgemm_zen_asm_6x8 vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - mov(%6, rcx) // load address of c - mov(%7, rdi) // load rs_c + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; @@ -976,7 +975,7 @@ void bli_dgemm_zen_asm_6x8 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -1103,7 +1102,7 @@ void bli_dgemm_zen_asm_6x8 label(.DCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -1151,8 +1150,8 @@ void bli_dgemm_zen_asm_6x8 - mov(%4, rax) // load address of alpha - mov(%5, rbx) // load address of beta + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -1174,7 +1173,7 @@ void bli_dgemm_zen_asm_6x8 - mov(%8, rsi) // load cs_c + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; @@ -1611,19 +1610,20 @@ void bli_dgemm_zen_asm_6x8 + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -1632,7 +1632,7 @@ void bli_dgemm_zen_asm_6x8 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } @@ -1691,14 +1691,13 @@ void bli_cgemm_zen_asm_3x8 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. 
add(imm(32*4), rbx) @@ -1706,8 +1705,8 @@ void bli_cgemm_zen_asm_3x8 vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - mov(%6, rcx) // load address of c - mov(%7, rdi) // load rs_c + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(scomplex) lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*rs_c; @@ -1720,7 +1719,7 @@ void bli_cgemm_zen_asm_3x8 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -1847,7 +1846,7 @@ void bli_cgemm_zen_asm_3x8 label(.CCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -1916,7 +1915,7 @@ void bli_cgemm_zen_asm_3x8 - mov(%4, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate @@ -1957,14 +1956,14 @@ void bli_cgemm_zen_asm_3x8 - mov(%5, rbx) // load address of beta + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate - mov(%8, rsi) // load cs_c + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(scomplex) lea(mem(, rsi, 4), rdx) // rdx = 4*cs_c; lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; @@ -2143,19 +2142,20 @@ void bli_cgemm_zen_asm_3x8 + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" 
(a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -2164,7 +2164,7 @@ void bli_cgemm_zen_asm_3x8 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } @@ -2219,14 +2219,13 @@ void bli_zgemm_zen_asm_3x4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. add(imm(32*4), rbx) @@ -2234,8 +2233,8 @@ void bli_zgemm_zen_asm_3x4 vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - mov(%6, rcx) // load address of c - mov(%7, rdi) // load rs_c + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) @@ -2249,7 +2248,7 @@ void bli_zgemm_zen_asm_3x4 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -2376,7 +2375,7 @@ void bli_zgemm_zen_asm_3x4 label(.ZCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -2444,7 +2443,7 @@ void bli_zgemm_zen_asm_3x4 - mov(%4, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate @@ -2485,14 +2484,14 @@ void bli_zgemm_zen_asm_3x4 - mov(%5, rbx) // load address of beta + mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate - mov(%8, rsi) // load cs_c + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dcomplex) lea(mem(, rsi, 2), rsi) lea(mem(, rsi, 2), rdx) // rdx = 2*cs_c; @@ -2671,19 +2670,20 @@ void bli_zgemm_zen_asm_3x4 + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -2692,7 +2692,7 @@ void bli_zgemm_zen_asm_3x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } diff --git a/kernels/zen/3/bli_gemm_zen_asm_d8x6.c b/kernels/zen/3/bli_gemm_zen_asm_d8x6.c index 46179b1af..20a764671 100644 --- a/kernels/zen/3/bli_gemm_zen_asm_d8x6.c +++ b/kernels/zen/3/bli_gemm_zen_asm_d8x6.c @@ -98,14 +98,13 @@ void bli_sgemm_zen_asm_16x6 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. 
- mov(%3, rbx) // load address of b. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. add(imm(32*4), rax) @@ -113,8 +112,8 @@ void bli_sgemm_zen_asm_16x6 vmovaps(mem(rax, -4*32), ymm0) vmovaps(mem(rax, -3*32), ymm1) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c; @@ -129,7 +128,7 @@ void bli_sgemm_zen_asm_16x6 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -256,7 +255,7 @@ void bli_sgemm_zen_asm_16x6 label(.SCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -304,8 +303,8 @@ void bli_sgemm_zen_asm_16x6 - mov(%4, rax) // load address of alpha - mov(%5, rbx) // load address of beta + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate @@ -327,7 +326,7 @@ void bli_sgemm_zen_asm_16x6 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) lea(mem(rcx, rsi, 8), rdx) // load address of c + 8*rs_c; @@ -614,19 +613,20 @@ void bli_sgemm_zen_asm_16x6 + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -635,7 +635,7 @@ void bli_sgemm_zen_asm_16x6 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } #define DGEMM_INPUT_GS_BETA_NZ \ @@ -684,14 +684,13 @@ void bli_dgemm_zen_asm_8x6 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. 
add(imm(32*4), rax) @@ -699,8 +698,8 @@ void bli_dgemm_zen_asm_8x6 vmovapd(mem(rax, -4*32), ymm0) vmovapd(mem(rax, -3*32), ymm1) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c; @@ -715,7 +714,7 @@ void bli_dgemm_zen_asm_8x6 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -842,7 +841,7 @@ void bli_dgemm_zen_asm_8x6 label(.DCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -890,8 +889,8 @@ void bli_dgemm_zen_asm_8x6 - mov(%4, rax) // load address of alpha - mov(%5, rbx) // load address of beta + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -913,7 +912,7 @@ void bli_dgemm_zen_asm_8x6 - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; @@ -1199,19 +1198,20 @@ void bli_dgemm_zen_asm_8x6 + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" 
(b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -1220,7 +1220,7 @@ void bli_dgemm_zen_asm_8x6 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } @@ -1279,14 +1279,13 @@ void bli_cgemm_zen_asm_8x3 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. add(imm(32*4), rax) @@ -1294,8 +1293,8 @@ void bli_cgemm_zen_asm_8x3 vmovaps(mem(rax, -4*32), ymm0) vmovaps(mem(rax, -3*32), ymm1) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*cs_c; @@ -1308,7 +1307,7 @@ void bli_cgemm_zen_asm_8x3 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -1435,7 +1434,7 @@ void bli_cgemm_zen_asm_8x3 label(.CCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -1504,7 +1503,7 @@ void bli_cgemm_zen_asm_8x3 - mov(%4, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate @@ -1545,14 +1544,14 @@ void bli_cgemm_zen_asm_8x3 - mov(%5, rbx) // load address of beta + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex) lea(mem(, rsi, 4), rdx) // rdx = 4*rs_c; lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; @@ -1731,19 +1730,20 @@ void bli_cgemm_zen_asm_8x3 + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -1752,7 +1752,7 @@ void bli_cgemm_zen_asm_8x3 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } @@ -1807,14 +1807,13 @@ void bli_zgemm_zen_asm_4x3 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. + mov(var(a), rax) // load address of a. + mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. 
add(imm(32*4), rax) @@ -1822,8 +1821,8 @@ void bli_zgemm_zen_asm_4x3 vmovapd(mem(rax, -4*32), ymm0) vmovapd(mem(rax, -3*32), ymm1) - mov(%6, rcx) // load address of c - mov(%8, rdi) // load cs_c + mov(var(c), rcx) // load address of c + mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) @@ -1837,7 +1836,7 @@ void bli_zgemm_zen_asm_4x3 - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -1964,7 +1963,7 @@ void bli_zgemm_zen_asm_4x3 label(.ZCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -2032,7 +2031,7 @@ void bli_zgemm_zen_asm_4x3 - mov(%4, rax) // load address of alpha + mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate @@ -2073,14 +2072,14 @@ void bli_zgemm_zen_asm_4x3 - mov(%5, rbx) // load address of beta + mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate - mov(%7, rsi) // load rs_c + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) lea(mem(, rsi, 2), rsi) lea(mem(, rsi, 2), rdx) // rdx = 2*rs_c; @@ -2259,19 +2258,20 @@ void bli_zgemm_zen_asm_4x3 + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" 
 (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -2280,7 +2280,7 @@ void bli_zgemm_zen_asm_4x3 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } diff --git a/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c b/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c index f8717384c..2a3c8653a 100644 --- a/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c +++ b/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c @@ -81,36 +81,35 @@ void bli_sgemmtrsm_l_zen_asm_6x16 float* beta = bli_sm1; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. + mov(var(a10), rax) // load address of a. + mov(var(b01), rbx) // load address of b. add(imm(32*4), rbx) // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - mov(%7, rcx) // load address of b11 + mov(var(b11), rcx) // load address of b11 mov(imm(16), rdi) // set rs_b = PACKNR = 16 lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float) // NOTE: c11, rs_c, and cs_c aren't // needed for a while, but we load // them now to avoid stalling later. - mov(%8, r8) // load address of c11 - mov(%9, r9) // load rs_c + mov(var(c11), r8) // load address of c11 + mov(var(rs_c), r9) // load rs_c lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float) - mov(%10, r10) // load cs_c + mov(var(cs_c), r10) // load cs_c lea(mem(, r10, 4), r10) // cs_c *= sizeof(float) - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
@@ -237,7 +236,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16 label(.SCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. @@ -286,7 +285,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16 - mov(%5, rbx) // load address of alpha + mov(var(alpha), rbx) // load address of alpha vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate @@ -365,7 +364,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16 // ymm14 ymm15 = ( beta50..57 ) ( beta58..5F ) - mov(%6, rax) // load address of a11 + mov(var(a11), rax) // load address of a11 mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall address of b11+8*cs_b @@ -772,19 +771,20 @@ void bli_sgemmtrsm_l_zen_asm_6x16 vzeroupper() + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a10), // 2 - "m" (b01), // 3 - "m" (beta), // 4 - "m" (alpha), // 5 - "m" (a11), // 6 - "m" (b11), // 7 - "m" (c11), // 8 - "m" (rs_c), // 9 - "m" (cs_c) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a10] "m" (a10), // 2 + [b01] "m" (b01), // 3 + [beta] "m" (beta), // 4 + [alpha] "m" (alpha), // 5 + [a11] "m" (a11), // 6 + [b11] "m" (b11), // 7 + [c11] "m" (c11), // 8 + [rs_c] "m" (rs_c), // 9 + [cs_c] "m" (cs_c) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -793,7 +793,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } @@ -835,36 +835,35 @@ void bli_dgemmtrsm_l_zen_asm_6x8 double* beta = bli_dm1; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. + mov(var(a10), rax) // load address of a. + mov(var(b01), rbx) // load address of b. 
 add(imm(32*4), rbx) // initialize loop by pre-loading vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - mov(%7, rcx) // load address of b11 + mov(var(b11), rcx) // load address of b11 mov(imm(8), rdi) // set rs_b = PACKNR = 8 lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double) // NOTE: c11, rs_c, and cs_c aren't // needed for a while, but we load // them now to avoid stalling later. - mov(%8, r8) // load address of c11 - mov(%9, r9) // load rs_c + mov(var(c11), r8) // load address of c11 + mov(var(rs_c), r9) // load rs_c lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double) - mov(%10, r10) // load cs_c + mov(var(cs_c), r10) // load cs_c lea(mem(, r10, 8), r10) // cs_c *= sizeof(double) - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -991,7 +990,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8 label(.DCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -1041,7 +1040,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8 - mov(%5, rbx) // load address of alpha + mov(var(alpha), rbx) // load address of alpha vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate @@ -1120,7 +1119,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8 // ymm14 ymm15 = ( beta50..53 ) ( beta54..57 ) - mov(%6, rax) // load address of a11 + mov(var(a11), rax) // load address of a11 mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall address of b11+4*cs_b @@ -1488,19 +1487,20 @@ void bli_dgemmtrsm_l_zen_asm_6x8 + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a10), // 2 - "m" (b01), // 3 - "m" (beta), // 4 - "m" (alpha), // 5 - "m" (a11), // 6 - "m" (b11), // 7 - "m" (c11), // 8 - "m" (rs_c), // 9 - "m" (cs_c) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a10] "m" (a10), // 2 + [b01] "m" (b01), // 3 + [beta] "m" (beta), // 4 + [alpha] "m" (alpha), // 5 + [a11] "m" (a11), // 6 + [b11] "m" (b11), // 7 + [c11] "m" (c11), // 8 + [rs_c] "m" (rs_c), // 9 + [cs_c] "m" (cs_c) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -1509,7 +1509,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } diff --git a/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c b/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c index 2cc742214..c4e84fc3a 100644 --- a/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c +++ b/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c @@ -81,36 +81,35 @@ void bli_sgemmtrsm_u_zen_asm_6x16 float* beta = bli_sm1; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. + mov(var(a10), rax) // load address of a. + mov(var(b01), rbx) // load address of b. 
 add(imm(32*4), rbx) // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - mov(%7, rcx) // load address of b11 + mov(var(b11), rcx) // load address of b11 mov(imm(16), rdi) // set rs_b = PACKNR = 16 lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float) // NOTE: c11, rs_c, and cs_c aren't // needed for a while, but we load // them now to avoid stalling later. - mov(%8, r8) // load address of c11 - mov(%9, r9) // load rs_c + mov(var(c11), r8) // load address of c11 + mov(var(rs_c), r9) // load rs_c lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float) - mov(%10, r10) // load cs_c + mov(var(cs_c), r10) // load cs_c lea(mem(, r10, 4), r10) // cs_c *= sizeof(float) - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -237,7 +236,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16 label(.SCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -286,7 +285,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16 - mov(%5, rbx) // load address of alpha + mov(var(alpha), rbx) // load address of alpha vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate @@ -365,7 +364,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16 // ymm14 ymm15 = ( beta50..57 ) ( beta58..5F ) - mov(%6, rax) // load address of a11 + mov(var(a11), rax) // load address of a11 mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall address of b11+8*cs_b @@ -776,20 +775,20 @@ void bli_sgemmtrsm_u_zen_asm_6x16 vzeroupper() - + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a10), // 2 - "m" (b01), // 3 - "m" (beta), // 4 - "m" (alpha), // 5 - "m" (a11), // 6 - "m" (b11), // 7 - "m" (c11), // 8 - "m" (rs_c), // 9 - "m" (cs_c) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a10] "m" (a10), // 2 + [b01] "m" (b01), // 3 + [beta] "m" (beta), // 4 + [alpha] "m" (alpha), // 5 + [a11] "m" (a11), // 6 + [b11] "m" (b11), // 7 + [c11] "m" (c11), // 8 + [rs_c] "m" (rs_c), // 9 + [cs_c] "m" (cs_c) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -798,7 +797,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) } @@ -840,36 +839,35 @@ void bli_dgemmtrsm_u_zen_asm_6x8 double* beta = bli_dm1; - __asm__ volatile - ( + begin_asm() vzeroall() // zero all xmm/ymm registers. - mov(%2, rax) // load address of a. - mov(%3, rbx) // load address of b. + mov(var(a10), rax) // load address of a. + mov(var(b01), rbx) // load address of b. 
 add(imm(32*4), rbx) // initialize loop by pre-loading vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - mov(%7, rcx) // load address of b11 + mov(var(b11), rcx) // load address of b11 mov(imm(8), rdi) // set rs_b = PACKNR = 8 lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double) // NOTE: c11, rs_c, and cs_c aren't // needed for a while, but we load // them now to avoid stalling later. - mov(%8, r8) // load address of c11 - mov(%9, r9) // load rs_c + mov(var(c11), r8) // load address of c11 + mov(var(rs_c), r9) // load rs_c lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double) - mov(%10, r10) // load cs_c + mov(var(cs_c), r10) // load cs_c lea(mem(, r10, 8), r10) // cs_c *= sizeof(double) - mov(%0, rsi) // i = k_iter; + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. @@ -996,7 +994,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8 label(.DCONSIDKLEFT) - mov(%1, rsi) // i = k_left; + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
@@ -1046,7 +1044,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8 - mov(%5, rbx) // load address of alpha + mov(var(alpha), rbx) // load address of alpha vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate @@ -1125,7 +1123,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8 // ymm14 ymm15 = ( beta50..53 ) ( beta54..57 ) - mov(%6, rax) // load address of a11 + mov(var(a11), rax) // load address of a11 mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall address of b11+4*cs_b @@ -1497,20 +1495,20 @@ void bli_dgemmtrsm_u_zen_asm_6x8 vzeroupper() - + end_asm( : // output operands (none) : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a10), // 2 - "m" (b01), // 3 - "m" (beta), // 4 - "m" (alpha), // 5 - "m" (a11), // 6 - "m" (b11), // 7 - "m" (c11), // 8 - "m" (rs_c), // 9 - "m" (cs_c) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a10] "m" (a10), // 2 + [b01] "m" (b01), // 3 + [beta] "m" (beta), // 4 + [alpha] "m" (alpha), // 5 + [a11] "m" (a11), // 6 + [b11] "m" (b11), // 7 + [c11] "m" (c11), // 8 + [rs_c] "m" (rs_c), // 9 + [cs_c] "m" (cs_c) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -1519,7 +1517,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" - ); + ) }