From 238d9fda9e2bb9e0199b266d88255173001f97fa Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Mon, 20 Mar 2023 14:35:02 -0500 Subject: [PATCH] Fixed ASAN memory issue due to modifying RBP register - RBP is the base pointer, which points to the base of the current stack frame. The ASAN tool relies on rbp and rsp for stack-related validations. So overwriting or modifying the RBP register results in application termination with the error code of stack overflow. - Removed all the code snippets which were using the rbp register for prefetching matrices and sometimes loading elements from memory in all of the gemm sup kernels for the double datatype. - Removed the reference to rbp from the register clobber list as well, to completely avoid the usage of the rbp register. AMD-Internal: [CPUPL-2613, CPUPL-2587] Change-Id: Idd402d3c644c4dd66e8d4988aede539ad8c77b28 --- .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c | 742 +++++++++--------- .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c | 621 ++++++++------- .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c | 114 +-- .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c | 14 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c | 10 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c | 5 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c | 7 +- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c | 26 +- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c | 14 +- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c | 14 +- 10 files changed, 780 insertions(+), 787 deletions(-) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c index 990358db8..a16bd3638 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2022, Advanced Micro Devices, Inc. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -986,14 +986,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -1004,14 +1003,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U // ---------------------------------- iteration 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 2 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 SUBITER_K4_3x4(rax, rbx) @@ -1029,7 +1028,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 
0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) @@ -1089,6 +1088,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -1154,7 +1154,6 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -1279,14 +1278,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -1296,14 +1294,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U // ---------------------------------- iteration 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 2 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 SUBITER_K4_3x4(rax, rbx) @@ -1323,7 +1321,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -1387,6 +1385,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -1449,14 +1448,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c 
+ prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -1467,14 +1465,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U // ---------------------------------- iteration 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 2 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 SUBITER_K4_3x4(rax, rbx) @@ -1493,7 +1491,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) @@ -1564,6 +1562,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -1635,7 +1634,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", 
"r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1744,15 +1743,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -1763,14 +1761,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U // ---------------------------------- iteration 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_2x4(rax, rbx) // ---------------------------------- iteration 1 SUBITER_K4_2x4(rax, rbx) // ---------------------------------- iteration 2 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_2x4(rax, rbx) // ---------------------------------- iteration 3 SUBITER_K4_2x4(rax, rbx) @@ -1789,7 +1787,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a 
SUBITER_K4_2x4(rax, rbx) dec(rsi) // i -= 1; @@ -1859,6 +1857,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -1927,7 +1926,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2043,15 +2042,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -2064,14 +2062,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 2 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 SUBITER_K4_3x4(rax, rbx) @@ -2091,7 +2089,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -2160,6 +2158,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -2242,7 +2241,6 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -2386,15 +2384,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -2406,14 +2403,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U // ---------------------------------- iteration 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 2 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 SUBITER_K4_3x4(rax, rbx) @@ -2433,7 +2430,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -2502,6 +2499,7 @@ void 
bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -2568,15 +2566,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -2589,7 +2586,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -2649,7 +2646,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -2720,7 +2717,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -2839,6 +2836,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -2910,7 +2908,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -3000,15 +2998,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; 
lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -3021,7 +3018,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -3071,7 +3068,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -3132,7 +3129,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -3241,6 +3238,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -3309,7 
+3307,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -3419,15 +3417,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -3439,14 +3436,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U // ---------------------------------- iteration 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 2 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 SUBITER_K4_3x4(rax, rbx) @@ -3466,7 +3463,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -3535,6 +3532,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -3601,15 +3599,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c 
+ prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -3622,7 +3619,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -3632,7 +3629,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -3653,7 +3650,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -3722,6 +3719,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -3812,15 +3810,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 
1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -3833,7 +3830,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -3844,7 +3841,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -3866,7 +3863,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -3936,6 +3933,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -4002,15 +4000,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; 
lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -4023,7 +4020,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -4034,7 +4031,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -4056,7 +4053,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -4126,6 +4123,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta 
and duplicate @@ -4193,7 +4191,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -4303,15 +4301,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -4324,7 +4321,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -4334,7 +4331,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -4355,7 +4352,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -4424,6 +4421,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -4498,15 +4496,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, 
mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -4519,7 +4516,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_1x4(rax, rbx) // ---------------------------------- iteration 1 @@ -4529,7 +4526,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_1x4(rax, rbx) // ---------------------------------- iteration 3 @@ -4550,7 +4547,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_1x4(rax, rbx) dec(rsi) // i -= 1; @@ -4593,6 +4590,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -4646,7 +4644,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -4757,15 +4755,14 @@ void 
bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -4778,7 +4775,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -4788,7 +4785,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -4809,7 +4806,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -4878,6 +4875,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + 
lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -4944,15 +4942,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -4965,7 +4962,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -4975,7 +4972,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -4996,7 +4993,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) 
// i -= 1; @@ -5065,6 +5062,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -5151,15 +5149,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -5172,7 +5169,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -5182,7 +5179,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -5203,7 +5200,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -5272,6 +5269,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -5338,15 +5336,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, 
mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -5359,7 +5356,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -5369,7 +5366,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -5390,7 +5387,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -5459,6 +5456,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -5526,7 +5524,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -5635,15 +5633,14 @@ void 
bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -5656,7 +5653,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_2x4(rax, rbx) // ---------------------------------- iteration 1 @@ -5666,7 +5663,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_2x4(rax, rbx) // ---------------------------------- iteration 3 @@ -5687,7 +5684,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_2x4(rax, rbx) dec(rsi) // i -= 1; @@ -5756,6 +5753,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + 
lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -5830,15 +5828,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -5851,7 +5848,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -5861,7 +5858,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -5882,7 +5879,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) 
// i -= 1; @@ -5952,6 +5949,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -6018,15 +6016,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -6039,7 +6036,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -6049,7 +6046,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -6070,7 +6067,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -6139,6 +6136,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -6214,7 +6212,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -6324,15 +6322,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // 
rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -6345,7 +6342,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -6355,7 +6352,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -6376,7 +6373,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -6446,6 +6443,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -6516,15 +6514,14 @@ void 
bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -6537,7 +6534,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -6547,7 +6544,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -6568,7 +6565,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -6637,6 +6634,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), 
rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -6728,14 +6726,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -6748,7 +6745,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) @@ -6797,7 +6794,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) @@ -6857,7 +6854,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) @@ -6953,6 +6950,7 
@@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -7016,7 +7014,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -7127,15 +7125,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -7148,7 +7145,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -7158,7 +7155,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -7179,7 +7176,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -7248,6 +7245,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -7314,15 +7312,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, 
rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -7335,7 +7332,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -7345,7 +7342,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -7366,7 +7363,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -7435,6 +7432,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -7517,15 +7515,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, 
r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -7538,7 +7535,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -7548,7 +7545,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -7569,7 +7566,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -7638,6 +7635,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -7708,15 +7706,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 
3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -7729,7 +7726,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -7739,7 +7736,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -7760,7 +7757,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -7829,6 +7826,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -7896,7 +7894,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L [a_next] "m" 
(a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -8006,15 +8004,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -8027,7 +8024,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; @@ -8065,7 +8062,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; @@ -8114,7 +8111,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; @@ -8187,6 +8184,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -8241,15 +8239,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + 
prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -8262,7 +8259,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -8272,7 +8269,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -8293,7 +8290,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -8362,6 +8359,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -8433,7 +8431,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -8543,15 +8541,14 @@ void 
bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -8564,7 +8561,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -8574,7 +8571,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -8595,7 +8592,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -8664,6 +8661,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + 
lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -8729,14 +8727,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -8749,7 +8746,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -8759,7 +8756,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -8780,7 +8777,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // 
i -= 1; @@ -8848,6 +8845,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -8926,14 +8924,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -8946,7 +8943,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -8956,7 +8953,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -8977,7 +8974,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) dec(rsi) // i -= 1; @@ -9046,6 +9043,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -9113,14 +9111,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, 
mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -9133,7 +9130,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 1 @@ -9142,7 +9139,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) // ---------------------------------- iteration 3 @@ -9163,7 +9160,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a SUBITER_K4_3x4(rax, rbx) @@ -9233,6 +9230,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -9299,7 +9297,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -9419,14 +9417,13 @@ void 
bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -9439,7 +9436,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) @@ -9489,7 +9486,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) @@ -9549,7 +9546,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) @@ -9645,6 +9642,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), 
rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -9707,7 +9705,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -9833,14 +9831,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -9853,7 +9850,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) @@ -9903,7 +9900,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) @@ -9964,7 +9961,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) @@ -10060,6 +10057,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -10121,7 +10119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -10176,14 +10174,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; 
lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -10196,7 +10193,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -10256,7 +10253,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -10327,7 +10324,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -10445,6 +10442,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate 
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -10512,14 +10510,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -10532,7 +10529,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -10592,7 +10589,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -10663,7 +10660,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -10780,6 +10777,7 @@ void 
bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -10858,14 +10856,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -10878,7 +10875,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -10922,7 +10919,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -10977,7 +10974,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; @@ -11050,6 +11047,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -11101,14 +11099,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 
3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -11121,7 +11118,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -11181,7 +11178,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -11252,7 +11249,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -11370,6 +11367,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -11438,7 +11436,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", 
"xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -11549,14 +11547,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -11569,7 +11566,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -11629,7 +11626,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -11700,7 +11697,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -11818,6 +11815,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L mov(var(alpha), rax) // load address of alpha 
mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -11885,14 +11883,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -11905,7 +11902,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -11965,7 +11962,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -12036,7 +12033,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax 
), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -12154,6 +12151,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -12233,15 +12231,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
@@ -12254,7 +12251,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -12298,7 +12295,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -12353,7 +12350,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; @@ -12436,6 +12433,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -12488,14 +12486,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L vmovapd( ymm4, ymm14) vmovapd( ymm4, ymm15) - lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; - prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, 
mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + lea(mem(r8, r8, 4), rcx) // rcx = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. @@ -12508,7 +12505,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -12568,7 +12565,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -12639,7 +12636,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a - prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a + prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) @@ -12756,6 +12753,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta + lea(mem(r12), rcx) // rcx = c_iijj; vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate @@ -12824,7 +12822,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -13979,7 +13977,7 @@ void 
bli_dgemmsup_rd_haswell_asm_6x2m [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c index cd00e1976..5908d80f2 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -315,21 +315,20 @@ void bli_dgemmsup_rd_haswell_asm_6x8n prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif - //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -363,7 +362,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 1 @@ -399,7 +398,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -466,32 +465,32 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -513,21 +512,21 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -535,12 +534,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -562,22 +561,22 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -616,7 +615,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -624,73 +623,73 @@ void bli_dgemmsup_rd_haswell_asm_6x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - jmp(.DDONE) // jump to end. - - - - - label(.DBETAZERO) - - + + + jmp(.DDONE) // jump to end. 
+ + + + + label(.DBETAZERO) + + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -712,7 +711,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n label(.DRETURN) - + end_asm( : // output operands (none) @@ -735,7 +734,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -838,7 +837,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -856,7 +855,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -909,21 +908,20 @@ void bli_dgemmsup_rd_haswell_asm_3x8n prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif - //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -957,7 +955,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 1 @@ -993,7 +991,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -1060,32 +1058,32 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1107,21 +1105,21 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1129,12 +1127,12 @@ void bli_dgemmsup_rd_haswell_asm_3x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1156,22 +1154,21 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1209,7 +1206,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n // ymm6[2] = sum(ymm12); ymm6[3] = sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1218,73 +1215,73 @@ void bli_dgemmsup_rd_haswell_asm_3x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - jmp(.DDONE) // jump to end. - - - - - label(.DBETAZERO) - - + + + jmp(.DDONE) // jump to end. 
+ + + + + label(.DBETAZERO) + + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -1300,7 +1297,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n label(.DRETURN) - + end_asm( : // output operands (none) @@ -1323,7 +1320,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1427,7 +1424,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1445,7 +1442,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1493,21 +1490,20 @@ void bli_dgemmsup_rd_haswell_asm_2x8n prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c #endif - //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -1536,7 +1532,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 1 @@ -1567,7 +1563,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -1624,31 +1620,31 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1666,21 +1662,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1688,11 +1684,11 @@ void bli_dgemmsup_rd_haswell_asm_2x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1710,21 +1706,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1751,7 +1747,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1759,65 +1755,65 @@ void bli_dgemmsup_rd_haswell_asm_2x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - - jmp(.DDONE) // jump to end. - - - - - label(.DBETAZERO) - - + + + jmp(.DDONE) // jump to end. 
+ + + + + label(.DBETAZERO) + + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -1833,7 +1829,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n label(.DRETURN) - + end_asm( : // output operands (none) @@ -1856,7 +1852,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1959,7 +1955,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1977,7 +1973,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -2020,21 +2016,20 @@ void bli_dgemmsup_rd_haswell_asm_1x8n //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c #endif - //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2058,7 +2053,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 #if 1 @@ -2084,7 +2079,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -2131,30 +2126,30 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -2168,21 +2163,21 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -2190,10 +2185,10 @@ void bli_dgemmsup_rd_haswell_asm_1x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -2207,20 +2202,20 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 - + + // ymm4 ymm7 ymm10 ymm13 + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -2235,7 +2230,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -2243,57 +2238,57 @@ void bli_dgemmsup_rd_haswell_asm_1x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - - jmp(.DDONE) // jump to end. - - - - - label(.DBETAZERO) - - + + + jmp(.DDONE) // jump to end. 
+ + + + + label(.DBETAZERO) + + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -2309,7 +2304,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n label(.DRETURN) - + end_asm( : // output operands (none) @@ -2332,7 +2327,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c index 8ac3612bd..d20058c5b 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 22, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -890,7 +890,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1587,7 +1587,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1946,7 +1946,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x8_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2191,7 +2191,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x16_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2331,8 +2331,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L label(.DPOSTPFETCH) mov(var(ps_a8), rdx) lea(mem(rax, rdx, 1), rdx) //rdx = a + ps_a8 //for prefetch - mov(var(ps_a8), rbp) - lea(mem(r11, rbp, 1), rbp) //rdx = a + ps_a8 //for prefetch + mov(var(ps_a8), rcx) + lea(mem(r11, rcx, 1), rcx) //rdx = a + ps_a8 //for prefetch mov(var(k_iter), rsi) test(rsi, rsi) je(.DCONSILEFT) @@ -2341,7 +2341,7 @@ void 
bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L label(.DMAIN) //0 prefetch(0, mem(rdx, 5*8)) - prefetch(0, mem(rbp, 5*8)) + prefetch(0, mem(rcx, 5*8)) vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) @@ -2373,7 +2373,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L add(r9, rax) //1 prefetch(0, mem(rdx, r9, 1, 5*8)) - prefetch(0, mem(rbp, r9, 1, 5*8)) + prefetch(0, mem(rcx, r9, 1, 5*8)) vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) @@ -2405,7 +2405,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L add(r9, rax) //2 prefetch(0, mem(rdx, r9, 2, 5*8)) - prefetch(0, mem(rbp, r9, 2, 5*8)) + prefetch(0, mem(rcx, r9, 2, 5*8)) vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) @@ -2436,10 +2436,12 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L add(r10, rbx) add(r9, rax) //3 - prefetch(0, mem(rdx, rcx, 1, 5*8)) - prefetch(0, mem(rbp, rcx, 1, 5*8)) - lea(mem(rdx, r9, 4), rdx) - lea(mem(rbp, r9, 4), rbp) + lea(mem(rdx, r9, 2), rdx) + lea(mem(rcx, r9, 2), rcx) + prefetch(0, mem(rdx, r9, 1, 5*8)) + prefetch(0, mem(rcx, r9, 1, 5*8)) + lea(mem(rdx, r9, 2), rdx) + lea(mem(rcx, r9, 2), rcx) vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) @@ -2481,8 +2483,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L label(.DLEFT) prefetch(0, mem(rdx, 5*8)) - prefetch(0, mem(rbp, 5*8)) - add(r9, rbp) + prefetch(0, mem(rcx, 5*8)) + add(r9, rcx) add(r9, rdx) vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) @@ -2836,7 +2838,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -3455,7 +3457,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x0_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list 
- "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -4084,7 +4086,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x8_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -4688,7 +4690,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_18x16_L [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -5214,7 +5216,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -5771,7 +5773,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x8_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -6327,7 +6329,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x16_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -6578,7 
+6580,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x0_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -6953,7 +6955,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x8_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -7440,7 +7442,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_18x16_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -7615,9 +7617,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U vmovupd(mem(rbx, 1*64), ymm0) add(r10, rbx) // b += rs_b; - lea(mem(rax, r13, 2), rbp) - vbroadcastsd(mem(rbp ), ymm2) - vbroadcastsd(mem(rbp, r8, 1), ymm3) + lea(mem(rax, r13, 2), r11) + vbroadcastsd(mem(r11 ), ymm2) + vbroadcastsd(mem(r11, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm12) vfmadd231pd(ymm1, ymm3, ymm14) @@ -7652,9 +7654,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U vmovupd(mem(rbx, 1*64), ymm0) add(r10, rbx) // b += rs_b; - lea(mem(rax, r13, 2), rbp) - vbroadcastsd(mem(rbp ), ymm2) - vbroadcastsd(mem(rbp, r8, 1), ymm3) + lea(mem(rax, r13, 2), r11) + vbroadcastsd(mem(r11 ), ymm2) + vbroadcastsd(mem(r11, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm12) vfmadd231pd(ymm1, ymm3, ymm14) @@ -7689,9 +7691,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U vmovupd(mem(rbx, 1*64), ymm0) add(r10, rbx) // b += rs_b; - lea(mem(rax, r13, 2), rbp) 
- vbroadcastsd(mem(rbp ), ymm2) - vbroadcastsd(mem(rbp, r8, 1), ymm3) + lea(mem(rax, r13, 2), r11) + vbroadcastsd(mem(r11 ), ymm2) + vbroadcastsd(mem(r11, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm12) vfmadd231pd(ymm1, ymm3, ymm14) add(r9, rax) // a += cs_a; @@ -7725,9 +7727,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U vmovupd(mem(rbx, 1*64), ymm0) add(r10, rbx) // b += rs_b; - lea(mem(rax, r13, 2), rbp) - vbroadcastsd(mem(rbp ), ymm2) - vbroadcastsd(mem(rbp, r8, 1), ymm3) + lea(mem(rax, r13, 2), r11) + vbroadcastsd(mem(r11 ), ymm2) + vbroadcastsd(mem(r11, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm12) vfmadd231pd(ymm1, ymm3, ymm14) add(r9, rax) // a += cs_a; @@ -7771,9 +7773,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U vmovupd(mem(rbx, 1*64), ymm0) add(r10, rbx) // b += rs_b; - lea(mem(rax, r13, 2), rbp) - vbroadcastsd(mem(rbp ), ymm2) - vbroadcastsd(mem(rbp, r8, 1), ymm3) + lea(mem(rax, r13, 2), r11) + vbroadcastsd(mem(r11 ), ymm2) + vbroadcastsd(mem(r11, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm12) vfmadd231pd(ymm1, ymm3, ymm14) add(r9, rax) // a += cs_a; @@ -7909,12 +7911,12 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) - lea(mem(rcx, 6*8), rbp) - lea(mem(rbp, rsi, 2), rbp) - vfmadd231pd(mem(rbp ), xmm3, xmm2) - vfmadd231pd(mem(rbp, rsi, 1), xmm3, xmm4) - vmovlpd(xmm2, mem(rbp)) - vmovupd(xmm4, mem(rbp, rsi, 1)) + lea(mem(rcx, 6*8), r11) + lea(mem(r11, rsi, 2), r11) + vfmadd231pd(mem(r11 ), xmm3, xmm2) + vfmadd231pd(mem(r11, rsi, 1), xmm3, xmm4) + vmovlpd(xmm2, mem(r11)) + vmovupd(xmm4, mem(r11, rsi, 1)) lea(mem(rdx, rsi, 4), rdx) @@ -8022,11 +8024,11 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) - lea(mem(rcx, rdi, 4), rbp) - lea(mem(rbp, rdi, 2), rbp) - lea(mem(rbp, rsi, 2), rbp) - vmovlpd(xmm2, mem(rbp)) - vmovupd(xmm4, 
mem(rbp, rsi, 1)) + lea(mem(rcx, rdi, 4), r11) + lea(mem(r11, rdi, 2), r11) + lea(mem(r11, rsi, 2), r11) + vmovlpd(xmm2, mem(r11)) + vmovupd(xmm4, mem(r11, rsi, 1)) lea(mem(rdx, rsi, 4), rdx) @@ -8079,7 +8081,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -8775,7 +8777,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -9433,7 +9435,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -10066,7 +10068,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c index a473ae33c..036338fbf 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c @@ -5,7 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -866,7 +866,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1616,7 +1616,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2275,7 +2275,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2955,7 +2955,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -3516,7 +3516,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -4041,7 +4041,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c index 8d3900f2e..08869010b 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2022, Advanced Micro Devices, Inc. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -564,7 +564,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -973,7 +973,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1347,7 +1347,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", 
"r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1686,7 +1686,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c index f19b703b4..bfc90d79a 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2022, Advanced Micro Devices, Inc. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -171,7 +171,6 @@ void bli_dgemmsup_rd_haswell_asm_6x4 prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a @@ -583,7 +582,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c index 571444bed..03f956095 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c @@ -5,7 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -254,7 +254,6 @@ void bli_dgemmsup_rd_haswell_asm_6x8 prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif - lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a @@ -674,7 +673,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1187,7 +1186,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c index eb1118196..1a64eb936 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -163,7 +163,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2 mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) - //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; + prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c @@ -544,7 +544,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -635,7 +635,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2 mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) - //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; + prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c @@ -994,7 +994,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1084,7 +1084,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2 mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) - //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c @@ -1402,7 +1402,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", 
"rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1491,7 +1491,7 @@ void bli_dgemmsup_rv_haswell_asm_3x2 mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) - //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c @@ -1807,7 +1807,7 @@ void bli_dgemmsup_rv_haswell_asm_3x2 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1895,7 +1895,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2 mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) - //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; + prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c @@ -2157,7 +2157,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2244,7 +2244,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2 mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) - //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; + prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c @@ -2484,7 +2484,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", 
"rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c index 9da1e7b83..7e7de3fdb 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -682,7 +682,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1257,7 +1257,7 @@ void bli_dgemmsup_rv_haswell_asm_5x6 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1755,7 +1755,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2270,7 +2270,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", 
"rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2701,7 +2701,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -3078,7 +3078,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c index a6c8f0e43..49339445a 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -798,7 +798,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1393,7 +1393,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1901,7 +1901,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2435,7 +2435,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2858,7 +2858,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -3248,7 +3248,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - 
"rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",