From e2e95a09b041fc3623820fa78c8d4eb321f5e2ff Mon Sep 17 00:00:00 2001 From: Hari Govind S Date: Tue, 30 Jul 2024 14:55:23 +0530 Subject: [PATCH] Fixing missing registers in end_asm for copyv APIs - Added the missing registers in end_asm for scopy, dcopy and zcopy APIs. - Removed unnecessary registers from end_asm for scopy and dcopy APIs. - Corrected mistakes in the comments. Change-Id: I5ebe2ff9cb2c72ca7c71a67419281f73462f9498 --- kernels/zen4/1/bli_copyv_zen4_asm_avx512.c | 38 +++++++++------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/kernels/zen4/1/bli_copyv_zen4_asm_avx512.c b/kernels/zen4/1/bli_copyv_zen4_asm_avx512.c index c5e6371fa..a3e7e4696 100644 --- a/kernels/zen4/1/bli_copyv_zen4_asm_avx512.c +++ b/kernels/zen4/1/bli_copyv_zen4_asm_avx512.c @@ -353,12 +353,9 @@ void bli_scopyv_zen4_asm_avx512 "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", - "zmm16", "zmm17", "zmm18", "zmm19", - "zmm20", "zmm21", "zmm22", "zmm23", - "zmm24", "zmm25", "zmm26", "zmm27", - "zmm28", "zmm29", "zmm30", "zmm31", - "rsi", "rdx", "rcx", "r8", - "r9", "r11" + "xmm0", "rsi", "rdx", "rcx", + "r8", "r9", "r11", "k2", + "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) @@ -697,12 +694,9 @@ void bli_dcopyv_zen4_asm_avx512 "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", - "zmm16", "zmm17", "zmm18", "zmm19", - "zmm20", "zmm21", "zmm22", "zmm23", - "zmm24", "zmm25", "zmm26", "zmm27", - "zmm28", "zmm29", "zmm30", "zmm31", - "rsi", "rdx", "rcx", "r8", - "r9", "r11" + "rsi", "rdi", "rcx", "r8", + "r9", "r11", "k2", "xmm0", + "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) @@ -996,10 +990,10 @@ void bli_zcopyv_zen4_asm_avx512 /* Creating mask: Example - fringe case = 1 - step 1 : rdx_o = (1111 1111)2 or (255)10 - step 2 : rdx_o = (1111 1110)2 or (254)10 - step 3 : rdx_o = (1111 1100)2 or (252)10 - step 4 : rdx_o = (0000 0011)2 or (3)10 + step 1 : rcx = (1111 1111)2 or (255)10 + step 2 : rcx = (1111 1110)2 or (254)10 + step 3 : rcx = (1111 1100)2 or (252)10 + step 4 : rcx = (0000 0011)2 or (3)10 */ // Loading the input values using masked load vmovupd(mem(rdx, 0*64), zmm0 MASK_(K(2))) @@ -1027,7 +1021,7 @@ void bli_zcopyv_zen4_asm_avx512 "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "rsi", "rdx", "rcx", - "r8", "r9" + "r8", "r9", "k2", "memory" ) } else @@ -1504,10 +1498,10 @@ void bli_zcopyv_zen4_asm_avx512 /* Creating mask: Example - fringe case = 1 - step 1 : rdx_o = (1111 1111)2 or (255)10 - step 2 : rdx_o = (1111 1110)2 or (254)10 - step 3 : rdx_o = (1111 1100)2 or (252)10 - step 4 : rdx_o = (0000 0011)2 or (3)10 + step 1 : rcx = (1111 1111)2 or (255)10 + step 2 : rcx = (1111 1110)2 or (254)10 + step 3 : rcx = (1111 1100)2 or (252)10 + step 4 : rcx = (0000 0011)2 or (3)10 */ // Loading the input values using masked load vmovupd(mem(rdx, 0*64), zmm0 MASK_(K(2))) @@ -1536,7 +1530,7 @@ void bli_zcopyv_zen4_asm_avx512 "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "rsi", "rdx", "rcx", "r8", - "r9" + "r9", "k2", "memory" ) } else