Fixed more Xeon Phi bugs, especially with scattered update

This commit is contained in:
Tyler Smith
2014-02-14 13:40:24 -06:00
parent 31134b5c70
commit ce06686368
5 changed files with 34 additions and 31 deletions

View File

@@ -78,7 +78,7 @@ GIT_LOG := $(GIT) log --decorate
# --- Determine the C compiler and related flags ---
CC := icc
CPPROCFLAGS :=
CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp
CMISCFLAGS := -mmic -fasm-blocks -std=c99
CDBGFLAGS :=
CWARNFLAGS := -Wall
COPTFLAGS := -O3
@@ -98,7 +98,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
LDFLAGS := -lm
LDFLAGS := -mmic -lm

View File

@@ -275,8 +275,8 @@ void bli_dgemm_opt_30x8(
auxinfo_t* data
)
{
double * a_next = bli_auxinfo_next_a( aux );
double * b_next = bli_auxinfo_next_b( aux );
double * a_next = bli_auxinfo_next_a( data );
double * b_next = bli_auxinfo_next_b( data );
int * offsetPtr = &offsets[0];
@@ -364,7 +364,7 @@ void bli_dgemm_opt_30x8(
LOOPMAIN:
ONE_ITER_MAIN_LOOP(rcx, rsi)
jne LOOPMAIN
//Penultimate 22 iterations.
//Break these off from the main loop to avoid prefetching unneeded data.
mov r14, a_next
@@ -398,20 +398,24 @@ void bli_dgemm_opt_30x8(
POSTACCUM:
// jmp END
#ifdef MONITORS
rdtsc
mov mid2l, eax
mov mid2h, edx
#endif
mov r9, c //load address of c for update
mov r12, alpha //load address of alpha
// Check whether C has unit column stride (cs_c == 1). If not, jump to the slow scattered update.
mov r14, cs_c
dec r14
jne SCATTEREDUPDATE
mov r14, beta
vbroadcastsd zmm31, 0[r14]
mov r9, c //load address of c for update
mov r12, alpha //load address of alpha
vmulpd zmm0, zmm0, 0[r12]{1to8}
vmulpd zmm1, zmm1, 0[r12]{1to8}
@@ -526,7 +530,7 @@ void bli_dgemm_opt_30x8(
vpbroadcastd zmm30, cs_c
mov r13, beta
vpmulld zmm30, zmm31, zmm30
mov ebx, 255
UPDATE_C_ROW_SCATTERED(zmm0, 0, r9)
UPDATE_C_ROW_SCATTERED(zmm1, 1, r9)

View File

@@ -45,8 +45,7 @@ void PASTEMAC(ch,varname)( \
ctype* b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data, \
dim_t thread_id \
auxinfo_t* data \
);
INSERT_GENTPROT_BASIC( gemm_opt_30x8 )

View File

@@ -9,7 +9,7 @@
#
1 # Number of repeats per experiment (best result is reported)
c # Matrix storage scheme(s) to test:
r # Matrix storage scheme(s) to test:
# 'c' = col-major storage; 'g' = general stride storage;
# 'r' = row-major storage
c # Vector storage scheme(s) to test:
@@ -17,12 +17,12 @@ c # Vector storage scheme(s) to test:
# 'r' = rowvec / unit stride; 'i' = rowvec / non-unit stride
0 # Test all combinations of storage schemes?
32 # General stride spacing (for cases when testing general stride)
sdcz # Datatype(s) to test:
d # Datatype(s) to test:
# 's' = single real; 'c' = single complex;
# 'd' = double real; 'z' = double complex
100 # Problem size: first to test
300 # Problem size: maximum to test
100 # Problem size: increment between experiments
120 # Problem size: first to test
480 # Problem size: maximum to test
120 # Problem size: increment between experiments
1 # Error-checking level:
# '0' = disable error checking; '1' = full error checking
i # Reaction to test failure:

View File

@@ -77,12 +77,12 @@
# --- Section overrides ----------------------------------------------------
1 # Utility
1 # Level-1v
1 # Level-1m
1 # Level-1f kernels
1 # Level-2
1 # Level-3 micro-kernels
0 # Utility
0 # Level-1v
0 # Level-1m
0 # Level-1f kernels
0 # Level-2
0 # Level-3 micro-kernels
1 # Level-3
@@ -293,47 +293,47 @@
-1 -1 -2 # dimensions: m n k
?? # parameters: transa transb
1 # hemm
0 # hemm
1 # test sequential front-end
-1 -2 # dimensions: m n
???? # parameters: side uploa conja transb
1 # herk
0 # herk
1 # test sequential front-end
-1 -2 # dimensions: m k
?? # parameters: uploc transa
1 # her2k
0 # her2k
1 # test sequential front-end
-1 -2 # dimensions: m k
??? # parameters: uploc transa transb
1 # symm
0 # symm
1 # test sequential front-end
-1 -2 # dimensions: m n
???? # parameters: side uploa conja transb
1 # syrk
0 # syrk
1 # test sequential front-end
-1 -2 # dimensions: m k
?? # parameters: uploc transa
1 # syr2k
0 # syr2k
1 # test sequential front-end
-1 -2 # dimensions: m k
??? # parameters: uploc transa transb
1 # trmm
0 # trmm
1 # test sequential front-end
-1 -1 # dimensions: m n
???? # parameters: side uploa transa diaga
1 # trmm3
0 # trmm3
1 # test sequential front-end
-1 -1 # dimensions: m n
????? # parameters: side uploa transa diaga transb
1 # trsm
0 # trsm
1 # test sequential front-end
-1 -1 # dimensions: m n
???? # parameters: side uploa transa diaga