Fixed ASAN memory issue due to modifying RBP register

- RBP is the base pointer, which points to the base of the current stack frame.
  The ASAN tool relies on rbp and rsp for stack-related validations, so overwriting
  or modifying the RBP register results in application termination with a
  stack-overflow error.
- Removed or replaced (with other general-purpose registers such as rcx and r11) all
  code that used the rbp register for prefetching matrices, and in some cases for
  loading elements from memory, in all of the gemm sup kernels for the double datatype.
- Removed references to rbp from the register clobber lists as well, to completely
  avoid any use of the rbp register.

AMD-Internal: [CPUPL-2613, CPUPL-2587]

Change-Id: Idd402d3c644c4dd66e8d4988aede539ad8c77b28
This commit is contained in:
Harsh Dave
2023-03-20 14:35:02 -05:00
parent dfc95d29fc
commit 238d9fda9e
10 changed files with 780 additions and 787 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 22, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -890,7 +890,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1587,7 +1587,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_L
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1946,7 +1946,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x8_L
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -2191,7 +2191,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x16_L
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -2331,8 +2331,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
label(.DPOSTPFETCH)
mov(var(ps_a8), rdx)
lea(mem(rax, rdx, 1), rdx) //rdx = a + ps_a8 //for prefetch
mov(var(ps_a8), rbp)
lea(mem(r11, rbp, 1), rbp) //rdx = a + ps_a8 //for prefetch
mov(var(ps_a8), rcx)
lea(mem(r11, rcx, 1), rcx) //rdx = a + ps_a8 //for prefetch
mov(var(k_iter), rsi)
test(rsi, rsi)
je(.DCONSILEFT)
@@ -2341,7 +2341,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
label(.DMAIN)
//0
prefetch(0, mem(rdx, 5*8))
prefetch(0, mem(rbp, 5*8))
prefetch(0, mem(rcx, 5*8))
vmovupd(mem(rbx, 0*32), ymm0)
vmovupd(mem(rbx, 1*32), ymm1)
@@ -2373,7 +2373,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
add(r9, rax)
//1
prefetch(0, mem(rdx, r9, 1, 5*8))
prefetch(0, mem(rbp, r9, 1, 5*8))
prefetch(0, mem(rcx, r9, 1, 5*8))
vmovupd(mem(rbx, 0*32), ymm0)
vmovupd(mem(rbx, 1*32), ymm1)
@@ -2405,7 +2405,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
add(r9, rax)
//2
prefetch(0, mem(rdx, r9, 2, 5*8))
prefetch(0, mem(rbp, r9, 2, 5*8))
prefetch(0, mem(rcx, r9, 2, 5*8))
vmovupd(mem(rbx, 0*32), ymm0)
vmovupd(mem(rbx, 1*32), ymm1)
@@ -2436,10 +2436,12 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
add(r10, rbx)
add(r9, rax)
//3
prefetch(0, mem(rdx, rcx, 1, 5*8))
prefetch(0, mem(rbp, rcx, 1, 5*8))
lea(mem(rdx, r9, 4), rdx)
lea(mem(rbp, r9, 4), rbp)
lea(mem(rdx, r9, 2), rdx)
lea(mem(rcx, r9, 2), rcx)
prefetch(0, mem(rdx, r9, 1, 5*8))
prefetch(0, mem(rcx, r9, 1, 5*8))
lea(mem(rdx, r9, 2), rdx)
lea(mem(rcx, r9, 2), rcx)
vmovupd(mem(rbx, 0*32), ymm0)
vmovupd(mem(rbx, 1*32), ymm1)
@@ -2481,8 +2483,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
label(.DLEFT)
prefetch(0, mem(rdx, 5*8))
prefetch(0, mem(rbp, 5*8))
add(r9, rbp)
prefetch(0, mem(rcx, 5*8))
add(r9, rcx)
add(r9, rdx)
vmovupd(mem(rbx, 0*32), ymm0)
vmovupd(mem(rbx, 1*32), ymm1)
@@ -2836,7 +2838,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -3455,7 +3457,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x0_L
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -4084,7 +4086,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x8_L
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -4688,7 +4690,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_18x16_L
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -5214,7 +5216,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_U
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -5771,7 +5773,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x8_U
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -6327,7 +6329,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x16_U
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -6578,7 +6580,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x0_U
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -6953,7 +6955,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x8_U
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -7440,7 +7442,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_18x16_U
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -7615,9 +7617,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
vmovupd(mem(rbx, 1*64), ymm0)
add(r10, rbx) // b += rs_b;
lea(mem(rax, r13, 2), rbp)
vbroadcastsd(mem(rbp ), ymm2)
vbroadcastsd(mem(rbp, r8, 1), ymm3)
lea(mem(rax, r13, 2), r11)
vbroadcastsd(mem(r11 ), ymm2)
vbroadcastsd(mem(r11, r8, 1), ymm3)
vfmadd231pd(ymm1, ymm2, ymm12)
vfmadd231pd(ymm1, ymm3, ymm14)
@@ -7652,9 +7654,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
vmovupd(mem(rbx, 1*64), ymm0)
add(r10, rbx) // b += rs_b;
lea(mem(rax, r13, 2), rbp)
vbroadcastsd(mem(rbp ), ymm2)
vbroadcastsd(mem(rbp, r8, 1), ymm3)
lea(mem(rax, r13, 2), r11)
vbroadcastsd(mem(r11 ), ymm2)
vbroadcastsd(mem(r11, r8, 1), ymm3)
vfmadd231pd(ymm1, ymm2, ymm12)
vfmadd231pd(ymm1, ymm3, ymm14)
@@ -7689,9 +7691,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
vmovupd(mem(rbx, 1*64), ymm0)
add(r10, rbx) // b += rs_b;
lea(mem(rax, r13, 2), rbp)
vbroadcastsd(mem(rbp ), ymm2)
vbroadcastsd(mem(rbp, r8, 1), ymm3)
lea(mem(rax, r13, 2), r11)
vbroadcastsd(mem(r11 ), ymm2)
vbroadcastsd(mem(r11, r8, 1), ymm3)
vfmadd231pd(ymm1, ymm2, ymm12)
vfmadd231pd(ymm1, ymm3, ymm14)
add(r9, rax) // a += cs_a;
@@ -7725,9 +7727,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
vmovupd(mem(rbx, 1*64), ymm0)
add(r10, rbx) // b += rs_b;
lea(mem(rax, r13, 2), rbp)
vbroadcastsd(mem(rbp ), ymm2)
vbroadcastsd(mem(rbp, r8, 1), ymm3)
lea(mem(rax, r13, 2), r11)
vbroadcastsd(mem(r11 ), ymm2)
vbroadcastsd(mem(r11, r8, 1), ymm3)
vfmadd231pd(ymm1, ymm2, ymm12)
vfmadd231pd(ymm1, ymm3, ymm14)
add(r9, rax) // a += cs_a;
@@ -7771,9 +7773,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
vmovupd(mem(rbx, 1*64), ymm0)
add(r10, rbx) // b += rs_b;
lea(mem(rax, r13, 2), rbp)
vbroadcastsd(mem(rbp ), ymm2)
vbroadcastsd(mem(rbp, r8, 1), ymm3)
lea(mem(rax, r13, 2), r11)
vbroadcastsd(mem(r11 ), ymm2)
vbroadcastsd(mem(r11, r8, 1), ymm3)
vfmadd231pd(ymm1, ymm2, ymm12)
vfmadd231pd(ymm1, ymm3, ymm14)
add(r9, rax) // a += cs_a;
@@ -7909,12 +7911,12 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
vunpckhpd(ymm14, ymm12, ymm1)
vextractf128(imm(0x1), ymm0, xmm2)
vextractf128(imm(0x1), ymm1, xmm4)
lea(mem(rcx, 6*8), rbp)
lea(mem(rbp, rsi, 2), rbp)
vfmadd231pd(mem(rbp ), xmm3, xmm2)
vfmadd231pd(mem(rbp, rsi, 1), xmm3, xmm4)
vmovlpd(xmm2, mem(rbp))
vmovupd(xmm4, mem(rbp, rsi, 1))
lea(mem(rcx, 6*8), r11)
lea(mem(r11, rsi, 2), r11)
vfmadd231pd(mem(r11 ), xmm3, xmm2)
vfmadd231pd(mem(r11, rsi, 1), xmm3, xmm4)
vmovlpd(xmm2, mem(r11))
vmovupd(xmm4, mem(r11, rsi, 1))
lea(mem(rdx, rsi, 4), rdx)
@@ -8022,11 +8024,11 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
vunpckhpd(ymm14, ymm12, ymm1)
vextractf128(imm(0x1), ymm0, xmm2)
vextractf128(imm(0x1), ymm1, xmm4)
lea(mem(rcx, rdi, 4), rbp)
lea(mem(rbp, rdi, 2), rbp)
lea(mem(rbp, rsi, 2), rbp)
vmovlpd(xmm2, mem(rbp))
vmovupd(xmm4, mem(rbp, rsi, 1))
lea(mem(rcx, rdi, 4), r11)
lea(mem(r11, rdi, 2), r11)
lea(mem(r11, rsi, 2), r11)
vmovlpd(xmm2, mem(r11))
vmovupd(xmm4, mem(r11, rsi, 1))
lea(mem(rdx, rsi, 4), rdx)
@@ -8079,7 +8081,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -8775,7 +8777,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -9433,7 +9435,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -10066,7 +10068,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -866,7 +866,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1616,7 +1616,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -2275,7 +2275,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -2955,7 +2955,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -3516,7 +3516,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -4041,7 +4041,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -564,7 +564,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -973,7 +973,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1347,7 +1347,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1686,7 +1686,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -171,7 +171,6 @@ void bli_dgemmsup_rd_haswell_asm_6x4
prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
#endif
lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a
@@ -583,7 +582,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -254,7 +254,6 @@ void bli_dgemmsup_rd_haswell_asm_6x8
prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
#endif
lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a
@@ -674,7 +673,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1187,7 +1186,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -163,7 +163,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2
mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
//lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c;
prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c
prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c
@@ -544,7 +544,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -635,7 +635,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2
mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
//lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c;
prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c
prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
@@ -994,7 +994,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1084,7 +1084,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2
mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
//lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c;
prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c
prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
@@ -1402,7 +1402,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1491,7 +1491,7 @@ void bli_dgemmsup_rv_haswell_asm_3x2
mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
//lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c;
prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c
prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
@@ -1807,7 +1807,7 @@ void bli_dgemmsup_rv_haswell_asm_3x2
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1895,7 +1895,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2
mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
//lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c;
prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c
prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
@@ -2157,7 +2157,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -2244,7 +2244,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2
mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
//lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c;
prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c
prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
@@ -2484,7 +2484,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -682,7 +682,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1257,7 +1257,7 @@ void bli_dgemmsup_rv_haswell_asm_5x6
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1755,7 +1755,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -2270,7 +2270,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -2701,7 +2701,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -3078,7 +3078,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -798,7 +798,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1393,7 +1393,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -1901,7 +1901,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -2435,7 +2435,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -2858,7 +2858,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
@@ -3248,7 +3248,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8
[a_next] "m" (a_next),
[b_next] "m" (b_next)*/
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp",
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",