From ce066863683cb4e910270cf8ab8e138b01ff3358 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 14 Feb 2014 13:40:24 -0600 Subject: [PATCH] Fixed more Xeon Phi bugs, especially with scattered update --- config/mic/make_defs.mk | 4 ++-- kernels/mic/3/bli_gemm_opt_30x8.c | 18 +++++++++++------- kernels/mic/3/bli_gemm_opt_30x8.h | 3 +-- testsuite/input.general | 10 +++++----- testsuite/input.operations | 30 +++++++++++++++--------------- 5 files changed, 34 insertions(+), 31 deletions(-) diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 23d5d9096..c894bc638 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -78,7 +78,7 @@ GIT_LOG := $(GIT) log --decorate # --- Determine the C compiler and related flags --- CC := icc CPPROCFLAGS := -CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp +CMISCFLAGS := -mmic -fasm-blocks -std=c99 CDBGFLAGS := CWARNFLAGS := -Wall COPTFLAGS := -O3 @@ -98,7 +98,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) -LDFLAGS := -lm +LDFLAGS := -mmic -lm diff --git a/kernels/mic/3/bli_gemm_opt_30x8.c b/kernels/mic/3/bli_gemm_opt_30x8.c index 71e77f9b3..c4d0ff11a 100644 --- a/kernels/mic/3/bli_gemm_opt_30x8.c +++ b/kernels/mic/3/bli_gemm_opt_30x8.c @@ -275,8 +275,8 @@ void bli_dgemm_opt_30x8( auxinfo_t* data ) { - double * a_next = bli_auxinfo_next_a( aux ); - double * b_next = bli_auxinfo_next_b( aux ); + double * a_next = bli_auxinfo_next_a( data ); + double * b_next = bli_auxinfo_next_b( data ); int * offsetPtr = &offsets[0]; @@ -364,7 +364,7 @@ void bli_dgemm_opt_30x8( LOOPMAIN: ONE_ITER_MAIN_LOOP(rcx, rsi) jne LOOPMAIN - + //Penultimate 22 iterations. //Break these off from the main loop to avoid prefetching extra shit. mov r14, a_next @@ -398,20 +398,24 @@ void bli_dgemm_opt_30x8( POSTACCUM: - // jmp END + #ifdef MONITORS rdtsc mov mid2l, eax mov mid2h, edx #endif + + mov r9, c //load address of c for update + mov r12, alpha //load address of alpha + + // Check if C is row stride. If not, jump to the slow scattered update mov r14, cs_c dec r14 jne SCATTEREDUPDATE mov r14, beta vbroadcastsd zmm31, 0[r14] - mov r9, c //load address of c for update - mov r12, alpha //load address of alpha + vmulpd zmm0, zmm0, 0[r12]{1to8} vmulpd zmm1, zmm1, 0[r12]{1to8} @@ -526,7 +530,7 @@ void bli_dgemm_opt_30x8( vpbroadcastd zmm30, cs_c mov r13, beta vpmulld zmm30, zmm31, zmm30 - + mov ebx, 255 UPDATE_C_ROW_SCATTERED(zmm0, 0, r9) UPDATE_C_ROW_SCATTERED(zmm1, 1, r9) diff --git a/kernels/mic/3/bli_gemm_opt_30x8.h b/kernels/mic/3/bli_gemm_opt_30x8.h index 3f27b767f..76e0e2d8e 100644 --- a/kernels/mic/3/bli_gemm_opt_30x8.h +++ b/kernels/mic/3/bli_gemm_opt_30x8.h @@ -45,8 +45,7 @@ void PASTEMAC(ch,varname)( \ ctype* b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data, \ - dim_t thread_id \ + auxinfo_t* data \ ); INSERT_GENTPROT_BASIC( gemm_opt_30x8 ) diff --git a/testsuite/input.general b/testsuite/input.general index 0f52b46c7..85957f457 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -9,7 +9,7 @@ # 1 # Number of repeats per experiment (best result is reported) -c # Matrix storage scheme(s) to test: +r # Matrix storage scheme(s) to test: # 'c' = col-major storage; 'g' = general stride storage; # 'r' = row-major storage c # Vector storage scheme(s) to test: @@ -17,12 +17,12 @@ c # Vector storage scheme(s) to test: # 'r' = rowvec / unit stride; 'i' = rowvec / non-unit stride 0 # Test all combinations of storage schemes? 32 # General stride spacing (for cases when testing general stride) -sdcz # Datatype(s) to test: +d # Datatype(s) to test: # 's' = single real; 'c' = single complex; # 'd' = double real; 'z' = double complex -100 # Problem size: first to test -300 # Problem size: maximum to test -100 # Problem size: increment between experiments +120 # Problem size: first to test +480 # Problem size: maximum to test +120 # Problem size: increment between experiments 1 # Error-checking level: # '0' = disable error checking; '1' = full error checking i # Reaction to test failure: diff --git a/testsuite/input.operations b/testsuite/input.operations index 6a508c814..bfa9020ca 100644 --- a/testsuite/input.operations +++ b/testsuite/input.operations @@ -77,12 +77,12 @@ # --- Section overrides ---------------------------------------------------- -1 # Utility -1 # Level-1v -1 # Level-1m -1 # Level-1f kernels -1 # Level-2 -1 # Level-3 micro-kernels +0 # Utility +0 # Level-1v +0 # Level-1m +0 # Level-1f kernels +0 # Level-2 +0 # Level-3 micro-kernels 1 # Level-3 @@ -293,47 +293,47 @@ -1 -1 -2 # dimensions: m n k ?? # parameters: transa transb -1 # hemm +0 # hemm 1 # test sequential front-end -1 -2 # dimensions: m n ???? # parameters: side uploa conja transb -1 # herk +0 # herk 1 # test sequential front-end -1 -2 # dimensions: m k ?? # parameters: uploc transa -1 # her2k +0 # her2k 1 # test sequential front-end -1 -2 # dimensions: m k ??? # parameters: uploc transa transb -1 # symm +0 # symm 1 # test sequential front-end -1 -2 # dimensions: m n ???? # parameters: side uploa conja transb -1 # syrk +0 # syrk 1 # test sequential front-end -1 -2 # dimensions: m k ?? # parameters: uploc transa -1 # syr2k +0 # syr2k 1 # test sequential front-end -1 -2 # dimensions: m k ??? # parameters: uploc transa transb -1 # trmm +0 # trmm 1 # test sequential front-end -1 -1 # dimensions: m n ???? # parameters: side uploa transa diaga -1 # trmm3 +0 # trmm3 1 # test sequential front-end -1 -1 # dimensions: m n ????? # parameters: side uploa transa diaga transb -1 # trsm +0 # trsm 1 # test sequential front-end -1 -1 # dimensions: m n ???? # parameters: side uploa transa diaga