mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Fixed more Xeon Phi bugs, especially with scattered update
This commit is contained in:
@@ -78,7 +78,7 @@ GIT_LOG := $(GIT) log --decorate
|
||||
# --- Determine the C compiler and related flags ---
|
||||
CC := icc
|
||||
CPPROCFLAGS :=
|
||||
CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp
|
||||
CMISCFLAGS := -mmic -fasm-blocks -std=c99
|
||||
CDBGFLAGS :=
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O3
|
||||
@@ -98,7 +98,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
LDFLAGS := -lm
|
||||
LDFLAGS := -mmic -lm
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -275,8 +275,8 @@ void bli_dgemm_opt_30x8(
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
double * a_next = bli_auxinfo_next_a( aux );
|
||||
double * b_next = bli_auxinfo_next_b( aux );
|
||||
double * a_next = bli_auxinfo_next_a( data );
|
||||
double * b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
int * offsetPtr = &offsets[0];
|
||||
|
||||
@@ -364,7 +364,7 @@ void bli_dgemm_opt_30x8(
|
||||
LOOPMAIN:
|
||||
ONE_ITER_MAIN_LOOP(rcx, rsi)
|
||||
jne LOOPMAIN
|
||||
|
||||
|
||||
//Penultimate 22 iterations.
|
||||
//Break these off from the main loop to avoid issuing unnecessary extra prefetches.
|
||||
mov r14, a_next
|
||||
@@ -398,20 +398,24 @@ void bli_dgemm_opt_30x8(
|
||||
|
||||
|
||||
POSTACCUM:
|
||||
// jmp END
|
||||
|
||||
#ifdef MONITORS
|
||||
rdtsc
|
||||
mov mid2l, eax
|
||||
mov mid2h, edx
|
||||
#endif
|
||||
|
||||
mov r9, c //load address of c for update
|
||||
mov r12, alpha //load address of alpha
|
||||
|
||||
// Check if C has unit column stride (cs_c == 1, i.e. row-major storage). If not, jump to the slow scattered update
|
||||
mov r14, cs_c
|
||||
dec r14
|
||||
jne SCATTEREDUPDATE
|
||||
|
||||
mov r14, beta
|
||||
vbroadcastsd zmm31, 0[r14]
|
||||
mov r9, c //load address of c for update
|
||||
mov r12, alpha //load address of alpha
|
||||
|
||||
|
||||
vmulpd zmm0, zmm0, 0[r12]{1to8}
|
||||
vmulpd zmm1, zmm1, 0[r12]{1to8}
|
||||
@@ -526,7 +530,7 @@ void bli_dgemm_opt_30x8(
|
||||
vpbroadcastd zmm30, cs_c
|
||||
mov r13, beta
|
||||
vpmulld zmm30, zmm31, zmm30
|
||||
|
||||
|
||||
mov ebx, 255
|
||||
UPDATE_C_ROW_SCATTERED(zmm0, 0, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm1, 1, r9)
|
||||
|
||||
@@ -45,8 +45,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data, \
|
||||
dim_t thread_id \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_opt_30x8 )
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#
|
||||
|
||||
1 # Number of repeats per experiment (best result is reported)
|
||||
c # Matrix storage scheme(s) to test:
|
||||
r # Matrix storage scheme(s) to test:
|
||||
# 'c' = col-major storage; 'g' = general stride storage;
|
||||
# 'r' = row-major storage
|
||||
c # Vector storage scheme(s) to test:
|
||||
@@ -17,12 +17,12 @@ c # Vector storage scheme(s) to test:
|
||||
# 'r' = rowvec / unit stride; 'i' = rowvec / non-unit stride
|
||||
0 # Test all combinations of storage schemes?
|
||||
32 # General stride spacing (for cases when testing general stride)
|
||||
sdcz # Datatype(s) to test:
|
||||
d # Datatype(s) to test:
|
||||
# 's' = single real; 'c' = single complex;
|
||||
# 'd' = double real; 'z' = double complex
|
||||
100 # Problem size: first to test
|
||||
300 # Problem size: maximum to test
|
||||
100 # Problem size: increment between experiments
|
||||
120 # Problem size: first to test
|
||||
480 # Problem size: maximum to test
|
||||
120 # Problem size: increment between experiments
|
||||
1 # Error-checking level:
|
||||
# '0' = disable error checking; '1' = full error checking
|
||||
i # Reaction to test failure:
|
||||
|
||||
@@ -77,12 +77,12 @@
|
||||
|
||||
# --- Section overrides ----------------------------------------------------
|
||||
|
||||
1 # Utility
|
||||
1 # Level-1v
|
||||
1 # Level-1m
|
||||
1 # Level-1f kernels
|
||||
1 # Level-2
|
||||
1 # Level-3 micro-kernels
|
||||
0 # Utility
|
||||
0 # Level-1v
|
||||
0 # Level-1m
|
||||
0 # Level-1f kernels
|
||||
0 # Level-2
|
||||
0 # Level-3 micro-kernels
|
||||
1 # Level-3
|
||||
|
||||
|
||||
@@ -293,47 +293,47 @@
|
||||
-1 -1 -2 # dimensions: m n k
|
||||
?? # parameters: transa transb
|
||||
|
||||
1 # hemm
|
||||
0 # hemm
|
||||
1 # test sequential front-end
|
||||
-1 -2 # dimensions: m n
|
||||
???? # parameters: side uploa conja transb
|
||||
|
||||
1 # herk
|
||||
0 # herk
|
||||
1 # test sequential front-end
|
||||
-1 -2 # dimensions: m k
|
||||
?? # parameters: uploc transa
|
||||
|
||||
1 # her2k
|
||||
0 # her2k
|
||||
1 # test sequential front-end
|
||||
-1 -2 # dimensions: m k
|
||||
??? # parameters: uploc transa transb
|
||||
|
||||
1 # symm
|
||||
0 # symm
|
||||
1 # test sequential front-end
|
||||
-1 -2 # dimensions: m n
|
||||
???? # parameters: side uploa conja transb
|
||||
|
||||
1 # syrk
|
||||
0 # syrk
|
||||
1 # test sequential front-end
|
||||
-1 -2 # dimensions: m k
|
||||
?? # parameters: uploc transa
|
||||
|
||||
1 # syr2k
|
||||
0 # syr2k
|
||||
1 # test sequential front-end
|
||||
-1 -2 # dimensions: m k
|
||||
??? # parameters: uploc transa transb
|
||||
|
||||
1 # trmm
|
||||
0 # trmm
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m n
|
||||
???? # parameters: side uploa transa diaga
|
||||
|
||||
1 # trmm3
|
||||
0 # trmm3
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m n
|
||||
????? # parameters: side uploa transa diaga transb
|
||||
|
||||
1 # trsm
|
||||
0 # trsm
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m n
|
||||
???? # parameters: side uploa transa diaga
|
||||
|
||||
Reference in New Issue
Block a user