diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c
index 2bb912680..ff995b5f0 100644
--- a/frame/compat/bla_gemm_amd.c
+++ b/frame/compat/bla_gemm_amd.c
@@ -712,7 +712,12 @@ void zgemm_
     //dim_t nt = bli_thread_get_num_threads(); // get number of threads
     bool nt = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked.
-    if ( nt )
+#ifdef AOCL_DYNAMIC
+    // For smaller sizes, zgemm_small performs better.
+    if ( nt && ( ( (m0 > 32) || (n0 > 32) || (k0 > 32) ) && ( (m0 + n0 + k0) > 100 ) ) )
+#else
+    if ( nt )
+#endif
     {
         // Will call parallelized zgemm code - sup & native
         PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
@@ -733,6 +738,31 @@ void zgemm_
         return;
     }

+#ifdef BLIS_ENABLE_SMALL_MATRIX
+    // Initialize status to a non-success value so the early-return path
+    // below is taken only when bli_gemm_small() actually succeeds.
+    err_t status = BLIS_NOT_YET_IMPLEMENTED;
+
+    if ( (nt == 0) && (m0 <= 512) && (n0 <= 512) && (k0 <= 512) )
+    {
+        status = bli_gemm_small( &alphao,
+                                 &ao,
+                                 &bo,
+                                 &betao,
+                                 &co,
+                                 NULL, //cntx,
+                                 NULL
+                               );
+    }
+
+    if (status == BLIS_SUCCESS)
+    {
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        bli_finalize_auto();
+
+        return;
+    }
+#endif

     // The code below will be called when number of threads = 1.
 #if ENABLE_INDUCED_METHOD
     /* 3m_sqp is optimal for certain matrix shapes.
@@ -769,13 +799,13 @@ void zgemm_
     // sup has been enabled for single instance cases.
     if(single_instance==1)
     {
-    err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
-    if(status==BLIS_SUCCESS)
-    {
-        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
-        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
-        return;
-    }
+        err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
+        if(status==BLIS_SUCCESS)
+        {
+            AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+            AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+            return;
+        }
     }

     // fall back on native path when zgemm is not handled in sup path.
diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c
index 3e9463fab..e0c84eb22 100644
--- a/kernels/zen/3/bli_gemm_small.c
+++ b/kernels/zen/3/bli_gemm_small.c
@@ -40,6 +40,7 @@
 #define MR 32
 #define D_MR (MR >> 1)
+#define Z_MR (MR >> 3)

 #define NR 3

 #define D_BLIS_SMALL_MATRIX_K_THRES_ROME 256
@@ -70,7 +71,26 @@ err_t bli_dgemm_small
        cntx_t* cntx,
        cntl_t* cntl
      );
-
+static err_t bli_zgemm_small
+     (
+       obj_t* alpha,
+       obj_t* a,
+       obj_t* b,
+       obj_t* beta,
+       obj_t* c,
+       cntx_t* cntx,
+       cntl_t* cntl
+     );
+static err_t bli_zgemm_small_At
+     (
+       obj_t* alpha,
+       obj_t* a,
+       obj_t* b,
+       obj_t* beta,
+       obj_t* c,
+       cntx_t* cntx,
+       cntl_t* cntl
+     );
 static err_t bli_sgemm_small_atbn
      (
        obj_t* alpha,
@@ -112,7 +132,7 @@ err_t bli_gemm_small
 #ifdef BLIS_ENABLE_MULTITHREADING
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
-    return BLIS_NOT_YET_IMPLEMENTED;
+    return BLIS_NOT_YET_IMPLEMENTED;
 #else
     // This function is invoked on all architectures including ‘generic’.
     // Non-AVX platforms will use the kernels derived from the context.
@@ -152,6 +172,18 @@ err_t bli_gemm_small
         return bli_dgemm_small_At(alpha, a, b, beta, c, cntx, cntl);
 #endif
     }
+    if (dt == BLIS_DCOMPLEX)
+    {
+#ifndef BLIS_ENABLE_MULTITHREADING
+        // bli_zgemm_small_At is called directly from the BLAS interface
+        // for sizes within the thresholds.
+        // Avoid calling bli_zgemm_small_At from gemm_front; direct such
+        // calls to the native implementation.
+        return BLIS_NOT_YET_IMPLEMENTED;
+#else
+        return bli_zgemm_small_At(alpha, a, b, beta, c, cntx, cntl);
+#endif
+    }

     if (bli_obj_has_notrans( b ))
     {
@@ -180,6 +212,19 @@ err_t bli_gemm_small
 #endif
     }

+    if (dt == BLIS_DCOMPLEX)
+    {
+#ifndef BLIS_ENABLE_MULTITHREADING
+        // bli_zgemm_small is called directly from the BLAS interface for
+        // sizes within the thresholds.
+        // Avoid calling bli_zgemm_small from gemm_front; direct such
+        // calls to the native implementation.
+        return BLIS_NOT_YET_IMPLEMENTED;
+#else
+        return bli_zgemm_small(alpha, a, b, beta, c, cntx, cntl);
+#endif
+    }
+
     if (dt == BLIS_FLOAT)
     {
         return bli_sgemm_small(alpha, a, b, beta, c, cntx, cntl);
@@ -189,7 +234,6 @@ err_t bli_gemm_small
     return BLIS_NOT_YET_IMPLEMENTED;
 };
-
 static err_t bli_sgemm_small
      (
        obj_t* alpha,
@@ -2865,7 +2909,6 @@ static err_t bli_sgemm_small

     if (m_remainder >= 4)
     {
-        //printf("HERE\n");
         m_remainder -= 4;

         for (col_idx = 0; (col_idx + 2) < N; col_idx += 3)
@@ -5377,7 +5420,6 @@ err_t bli_dgemm_small_At

     if (m_remainder >= 4)
     {
-        //printf("HERE\n");
         m_remainder -= 4;

         tA = A + row_idx * lda;
@@ -5705,5 +5747,7679 @@ err_t bli_dgemm_small_At
         return BLIS_NONCONFORMAL_DIMENSIONS;
     }
 };
-#endif
+
+#define BLIS_SET_YMM_REG_ZEROS \
+    ymm4 = _mm256_setzero_pd(); \
+    ymm5 = _mm256_setzero_pd(); \
+    ymm6 = _mm256_setzero_pd(); \
+    ymm7 = _mm256_setzero_pd(); \
+    ymm14 = _mm256_setzero_pd(); \
+    ymm15 = _mm256_setzero_pd(); \
+    ymm16 = _mm256_setzero_pd(); \
+    ymm17 = _mm256_setzero_pd(); \
+    ymm18 = _mm256_setzero_pd(); \
+    ymm19 = _mm256_setzero_pd(); \
+    ymm20 = _mm256_setzero_pd(); \
+    ymm21 = _mm256_setzero_pd(); \
+
+#define BLIS_SET_ALL_YMM_REG_ZEROS \
+    ymm4 = _mm256_setzero_pd(); \
+    ymm5 = _mm256_setzero_pd(); \
+    ymm6 = _mm256_setzero_pd(); \
+    ymm7 = _mm256_setzero_pd(); \
+    ymm8 = _mm256_setzero_pd(); \
+    ymm9 = _mm256_setzero_pd(); \
+    ymm10 = _mm256_setzero_pd(); \
+    ymm11 = _mm256_setzero_pd(); \
+    ymm12 = _mm256_setzero_pd(); \
+    ymm13 = _mm256_setzero_pd(); \
+    ymm14 = _mm256_setzero_pd(); \
+    ymm15 = _mm256_setzero_pd(); \
+
+static err_t bli_zgemm_small
+     (
+       obj_t* alpha,
+       obj_t* a,
+       obj_t* b,
+       obj_t* beta,
+       obj_t* c,
+       cntx_t* cntx,
+       cntl_t* cntl
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
+
+    bool conjtransa = bli_obj_has_conj(a);
+    bool conjtransb = bli_obj_has_conj(b);
+
+    gint_t M = bli_obj_length( c ); // number of rows of Matrix C
+    gint_t N = bli_obj_width( c );  // number of columns of Matrix C
+    // number of columns of OP(A), will be updated if OP(A) is Transpose(A)
+    gint_t K = bli_obj_width( a );
+    gint_t L = M * N;
+
+    if (L && K)
+    {
+        guint_t lda = bli_obj_col_stride( a ); // column stride of matrix OP(A).
+        guint_t ldb = bli_obj_col_stride( b ); // column stride of matrix OP(B).
+        guint_t ldc = bli_obj_col_stride( c ); // column stride of matrix C
+        guint_t row_idx, col_idx, k;
+        dcomplex *A = bli_obj_buffer_at_off(a); // pointer to elements of Matrix A
+        dcomplex *B = bli_obj_buffer_at_off(b); // pointer to elements of Matrix B
+        dcomplex *C = bli_obj_buffer_at_off(c); // pointer to elements of Matrix C
+
+        dcomplex *tA = A, *tB = B, *tC = C;
+        dcomplex *tA_packed;    // temporary pointer to hold packed A memory pointer
+        guint_t row_idx_packed; // packed A memory row index
+        guint_t lda_packed;     // lda of packed A
+        guint_t col_idx_start;  // starting column index after A matrix is packed.
+        dim_t tb_inc_row = 1;   // row stride of matrix B
+        dim_t tb_inc_col = ldb; // column stride of matrix B
+        __m256d ymm4, ymm5, ymm6, ymm7;
+        __m256d ymm8, ymm9, ymm10, ymm11;
+        __m256d ymm12, ymm13, ymm14, ymm15;
+        __m256d ymm16, ymm17, ymm18, ymm19, ymm20, ymm21;
+        __m256d ymm0, ymm1, ymm2, ymm3;
+
+        gint_t n_remainder; // used when N is not a multiple of 3 (N % 3)
+        gint_t m_remainder; // used when M is not a multiple of 4 (M % 4)
+
+        dcomplex *alpha_cast, *beta_cast; // alpha, beta scaling factors
+        alpha_cast = bli_obj_buffer_for_1x1(BLIS_DCOMPLEX, alpha);
+        beta_cast = bli_obj_buffer_for_1x1(BLIS_DCOMPLEX, beta);
+
+        gint_t required_packing_A = 1;
+        mem_t local_mem_buf_A_s;
+        dcomplex *D_A_pack = NULL;
+        rntm_t rntm;
+
+        // update the pointer math if matrix B needs to be transposed.
+        if (bli_obj_has_trans( b ))
+        {
+            tb_inc_col = 1; // switch row and column strides
+            tb_inc_row = ldb;
+        }
+
+        // Check whether the beta value is zero.
+        // If true, we should perform the C = alpha * (A * B) operation
+        // instead of C = beta * C + alpha * (A * B).
+        bool is_beta_non_zero = 0;
+        if(!bli_obj_equals(beta, &BLIS_ZERO))
+            is_beta_non_zero = 1;
+
+        /*
+         * This function used to pack part of the A input into a global
+         * array when needed. However, using a global array makes the
+         * function non-reentrant. Instead of using a global array we
+         * should allocate a buffer for each invocation. Since the buffer
+         * is too big for the stack, and calling malloc on every
+         * invocation would be too expensive, the better approach is to
+         * get the buffer from the pre-allocated pool and return it to
+         * the pool once we are done.
+         *
+         * In order to get the buffer from the pool, we need access to
+         * the memory broker. Currently this function is not invoked in
+         * such a way that it can receive the memory broker (via rntm).
+         * The following hack fetches the global memory broker, which can
+         * then be used to access the pool.
+         *
+         * Note that there will be a memory allocation at least on the
+         * first invocation, as no pool will have been created for this
+         * size yet. Subsequent invocations will simply reuse the buffer
+         * from the pool.
+         */
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Get the current size of the buffer pool for A block packing.
+        // We will use the same size to avoid pool re-initialization.
+        siz_t buffer_size = bli_pool_block_size(
+            bli_membrk_pool(bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK),
+                            bli_rntm_membrk(&rntm)));
+
+        //
+        // This kernel assumes that "A" will be unpacked if N <= 3.
+        // Usually this range (N <= 3) is handled by SUP; however, if SUP
+        // is disabled, or if for any other reason we do enter this kernel
+        // with N <= 3, we want to make sure that "A" remains unpacked.
+        //
+        // A packed Z_MR x K block of dcomplex elements occupies
+        // (Z_MR * K) << 4 bytes (16 bytes per dcomplex element), which is
+        // what must fit in the pool block.
+        if ((N < 3) || (((Z_MR * K) << 4) > buffer_size))
+        {
+            required_packing_A = 0;
+        }
+
+        if (required_packing_A == 1)
+        {
+#ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_zgemm_small: Requesting mem pool block of size %lu\n",
+                    buffer_size);
+#endif
+            // Get the buffer from the pool.
+            bli_membrk_acquire_m(&rntm,
+                                 buffer_size,
+                                 BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
+                                 &local_mem_buf_A_s);
+
+            D_A_pack = bli_mem_buffer(&local_mem_buf_A_s);
+        }
+
+        /*
+         * The computation loop runs over Z_MR x N columns of the C matrix,
+         * thus accessing Z_MR x K of A matrix data and K x NR of B matrix
+         * data. The computation is organized as inner loops of dimension
+         * Z_MR x NR.
+         */
+        // Process Z_MR rows of the C matrix at a time.
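+        //
+        // Throughout the kernels below, each complex accumulator is kept
+        // in split form: one set of FMAs accumulates A * real(B) (ymm8,
+        // ymm9, ...) while a second set accumulates A * imag(B) (ymm4,
+        // ymm6, ...). After the k loop, permuting the imaginary-part
+        // accumulator and combining with _mm256_addsub_pd() recovers the
+        // packed (real, imag) results, since
+        // (ar + ai*i) * (br + bi*i) = (ar*br - ai*bi) + (ar*bi + ai*br)*i.
+        //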
+        for (row_idx = 0; (row_idx + (Z_MR - 1)) < M; row_idx += Z_MR)
+        {
+            col_idx_start = 0;
+            tA_packed = A;
+            row_idx_packed = row_idx;
+            lda_packed = lda;
+
+            /**
+             * This is the part of the pack and compute optimization.
+             * During the first column iteration, we store the accessed A
+             * matrix into a contiguous scratch buffer. This helps to keep
+             * the A matrix in cache and avoids TLB misses.
+             */
+            if (required_packing_A)
+            {
+                col_idx = 0;
+
+                // pointer math to point to proper memory
+                tC = C + ldc * col_idx + row_idx;
+                tB = B + tb_inc_col * col_idx;
+                tA = A + row_idx;
+                tA_packed = D_A_pack;
+
+#ifdef BLIS_ENABLE_PREFETCH
+                _mm_prefetch((char*)(tC + 0), _MM_HINT_T0);
+                _mm_prefetch((char*)(tC + 8), _MM_HINT_T0);
+                _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0);
+                _mm_prefetch((char*)(tC + ldc + 8), _MM_HINT_T0);
+                _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0);
+                _mm_prefetch((char*)(tC + 2 * ldc + 8), _MM_HINT_T0);
+#endif
+                // clear scratch registers.
+                BLIS_SET_ALL_YMM_REG_ZEROS
+
+                double *tptr = (double *)tB;
+                if(conjtransa && conjtransb)
+                {
+                    ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+                    ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+                    for (k = 0; k < K; ++k)
+                    {
+                        // The inner loop broadcasts the B matrix data and
+                        // multiplies it with the A matrix.
+                        // This loop is processing Z_MR x K.
+                        ymm2 = _mm256_broadcast_sd(
+                                (double const *)(tptr + tb_inc_col * 0));
+                        ymm3 = _mm256_broadcast_sd(
+                                (double const *)(tptr + tb_inc_col * 0 + 1));
+                        ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+                        // broadcast B matrix elements are multiplied
+                        // with matrix A columns.
+                        ymm0 = _mm256_loadu_pd(
+                                (double const *)tA);
+                        ymm1 = _mm256_loadu_pd(
+                                (double const *)(tA + 2));
+                        _mm256_storeu_pd(
+                                (double *)tA_packed, ymm0);
+                        _mm256_storeu_pd(
+                                (double *)(tA_packed + 2), ymm1);
+                        ymm0 = _mm256_mul_pd(ymm0, ymm20);
+                        ymm1 = _mm256_mul_pd(ymm1, ymm20);
+
+                        ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+                        ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11);
+                        ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+                        ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5);
+
+                        ymm2 = _mm256_broadcast_sd(
+                                (double const *)(tptr + tb_inc_col * 2));
+                        ymm3 = _mm256_broadcast_sd(
+                                (double const *)(tptr + tb_inc_col * 2 + 1));
+                        ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+                        ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9);
+                        ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12);
+                        ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6);
+                        ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7);
+
+                        ymm2 = _mm256_broadcast_sd(
+                                (double const *)(tptr + (tb_inc_col*2) * 2));
+                        ymm3 = _mm256_broadcast_sd(
+                                (double const *)(tptr + (tb_inc_col*2) * 2 + 1));
+                        ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+                        ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10);
+                        ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13);
+                        ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14);
+                        ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15);
+
+                        tptr += (tb_inc_row * 2);
+                        tB += tb_inc_row;
+                        tA += lda;
+                        tA_packed += Z_MR;
+                    }
+
+                }
+                else if(conjtransa)
+                {
+                    ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+                    for (k = 0; k < K; ++k)
+                    {
+                        // The inner loop broadcasts the B matrix data and
+                        // multiplies it with the A matrix.
+                        // This loop is processing Z_MR x K.
+                        ymm2 = _mm256_broadcast_sd(
+                                (double const *)(tptr + tb_inc_col * 0));
+                        ymm3 = _mm256_broadcast_sd(
+                                (double const *)(tptr + tb_inc_col * 0 + 1));
+
+                        // broadcast B matrix elements are multiplied
+                        // with matrix A columns.
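+                        // conj(A) is applied on the fly: after the plain
+                        // load (and the pack store below), multiplying by
+                        // ymm20 = {1.0, -1.0, 1.0, -1.0} negates the
+                        // imaginary part of every element of A.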
+ ymm0 = _mm256_loadu_pd( + (double const *)tA); + ymm1 = _mm256_loadu_pd( + (double const *)(tA + 2)); + _mm256_storeu_pd( + (double *)tA_packed, ymm0); + _mm256_storeu_pd( + (double *)(tA_packed + 2) + , ymm1); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda; + tA_packed += Z_MR; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and multiplies it with the A + // matrix. This loop is processing + // Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied with matrix A columns. + ymm0 = _mm256_loadu_pd( + (double const *)tA); + ymm1 = _mm256_loadu_pd( + (double const *)(tA + 2)); + _mm256_storeu_pd( + (double *)tA_packed, ymm0); + _mm256_storeu_pd( + (double *)(tA_packed + 2) + , ymm1); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda; + tA_packed += Z_MR; + } + + } + else //handles non-transpose case + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and multiplies it with the A + // matrix. This loop is processing + // Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ ymm0 = _mm256_loadu_pd( + (double const *)tA); + ymm1 = _mm256_loadu_pd( + (double const *)(tA + 2)); + _mm256_storeu_pd( + (double *)tA_packed, ymm0); + _mm256_storeu_pd( + (double *)(tA_packed + 2) + , ymm1); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda; + tA_packed += Z_MR; + } + } + + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm14 = _mm256_permute_pd(ymm14, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm12 = _mm256_addsub_pd(ymm12, ymm7); + ymm10 = _mm256_addsub_pd(ymm10, ymm14); + ymm13 = _mm256_addsub_pd(ymm13, ymm15); + + // alpha, beta multiplication. + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm10, ymm0); + ymm14 = _mm256_mul_pd(ymm10, ymm14); + ymm10 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm12, ymm0); + ymm14 = _mm256_mul_pd(ymm12, ymm14); + ymm12 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm13, ymm14); + ymm13 = _mm256_hsub_pd(ymm15, ymm14); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + (&beta_cast->imag)); + + + BLIS_SET_YMM_REG_ZEROS + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
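+                    // beta * C is formed the same split way: ymm4/ymm6/...
+                    // accumulate C * real(beta) while ymm5/ymm7/...
+                    // accumulate C * imag(beta); the permute/addsub
+                    // sequence after this block folds the two halves back
+                    // into packed complex values.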
+ ymm0 = _mm256_loadu_pd((double const *)tC); + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_loadu_pd((double const *)(tC + 2)); + ymm6 = _mm256_fmadd_pd(ymm0, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + + // col 2 + ymm0 = _mm256_loadu_pd((double const *)(tC + ldc)); + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc + 2)); + ymm16 = _mm256_fmadd_pd(ymm0, ymm2, ymm16); + ymm17 = _mm256_fmadd_pd(ymm0, ymm3, ymm17); + + // col 3 + ymm0 = _mm256_loadu_pd((double const *) + (tC + (ldc * 2))); + ymm18 = _mm256_fmadd_pd(ymm0, ymm2, ymm18); + ymm19 = _mm256_fmadd_pd(ymm0, ymm3, ymm19); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + (ldc * 2) + 2)); + ymm20 = _mm256_fmadd_pd(ymm0, ymm2, ymm20); + ymm21 = _mm256_fmadd_pd(ymm0, ymm3, ymm21); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm17 = _mm256_permute_pd(ymm17, 0x5); + ymm19 = _mm256_permute_pd(ymm19, 0x5); + ymm21 = _mm256_permute_pd(ymm21, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm16 = _mm256_addsub_pd(ymm16, ymm17); + ymm18 = _mm256_addsub_pd(ymm18, ymm19); + ymm20 = _mm256_addsub_pd(ymm20, ymm21); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm12 = _mm256_add_pd(ymm12, ymm16); + ymm10 = _mm256_add_pd(ymm10, ymm18); + ymm13 = _mm256_add_pd(ymm13, ymm20); + + _mm256_storeu_pd((double *)tC, ymm8); + _mm256_storeu_pd((double *)(tC + 2), ymm11); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm9); + _mm256_storeu_pd((double *)(tC + 2), ymm12); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm10); + _mm256_storeu_pd((double *)(tC + 2), ymm13); + + // modify the pointer arithematic to use packed A matrix. + col_idx_start = NR; + tA_packed = D_A_pack; + row_idx_packed = 0; + lda_packed = Z_MR; + } + // Process NR columns of C matrix at a time. + for (col_idx = col_idx_start; (col_idx + (NR - 1)) < N; + col_idx += NR) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + +#ifdef BLIS_ENABLE_PREFETCH + _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 8), _MM_HINT_T0); + _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); + _mm_prefetch((char*)(tC + ldc + 8), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 2 * ldc + 8), _MM_HINT_T0); +#endif + // clear scratch registers. + + + BLIS_SET_ALL_YMM_REG_ZEROS + + double *tptr = (double *)tB; + + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ ymm0 = _mm256_loadu_pd( + (double const *)tA); + ymm1 = _mm256_loadu_pd( + (double const *)(tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and multiplies it with the A + // matrix. This loop is processing + // Z_MR x K The inner loop broadcasts + // the B matrix data and multiplies it + // with the A matrix. This loop is + // processing Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and multiplies it with the A + // matrix. This loop is processing + // Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied with matrix A columns. 
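+                        // conj(B) needs no extra work on A: negating the
+                        // broadcast imaginary parts of B (ymm3 multiplied
+                        // by ymm21 = {-1.0, -1.0, -1.0, -1.0}) conjugates
+                        // B, and the A columns are used exactly as loaded.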
+ ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else //handles non-transpose case + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and multiplies it with the A + // matrix. This loop is processing + // Z_MR x K The inner loop broadcasts the + // B matrix data and multiplies it with + // the A matrix. This loop is processing + // Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm14 = _mm256_permute_pd(ymm14, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm12 = _mm256_addsub_pd(ymm12, ymm7); + ymm10 = _mm256_addsub_pd(ymm10, ymm14); + ymm13 = _mm256_addsub_pd(ymm13, ymm15); + + // alpha, beta multiplication. 
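+                // Scaling by alpha uses a horizontal-subtract trick: for
+                // an accumulator x, ymm15 = x * {ar, ai, ar, ai} and
+                // ymm14 = x * {ai, -ar, ai, -ar}, so
+                // _mm256_hsub_pd(ymm15, ymm14) produces
+                // {xr*ar - xi*ai, xr*ai + xi*ar, ...}, i.e. x * alpha in
+                // packed complex form.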
+ ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm10, ymm0); + ymm14 = _mm256_mul_pd(ymm10, ymm14); + ymm10 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm12, ymm0); + ymm14 = _mm256_mul_pd(ymm12, ymm14); + ymm12 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm13, ymm14); + ymm13 = _mm256_hsub_pd(ymm15, ymm14); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + + BLIS_SET_YMM_REG_ZEROS + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. + ymm0 = _mm256_loadu_pd((double const *)tC); + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_loadu_pd((double const *)(tC + 2)); + ymm6 = _mm256_fmadd_pd(ymm0, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + + ymm0 = _mm256_loadu_pd((double const *)(tC + ldc)); + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc + 2)); + ymm16 = _mm256_fmadd_pd(ymm0, ymm2, ymm16); + ymm17 = _mm256_fmadd_pd(ymm0, ymm3, ymm17); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc * 2)); + ymm18 = _mm256_fmadd_pd(ymm0, ymm2, ymm18); + ymm19 = _mm256_fmadd_pd(ymm0, ymm3, ymm19); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc * 2 + 2)); + ymm20 = _mm256_fmadd_pd(ymm0, ymm2, ymm20); + ymm21 = _mm256_fmadd_pd(ymm0, ymm3, ymm21); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm17 = _mm256_permute_pd(ymm17, 0x5); + ymm19 = _mm256_permute_pd(ymm19, 0x5); + ymm21 = _mm256_permute_pd(ymm21, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm16 = _mm256_addsub_pd(ymm16, ymm17); + ymm18 = _mm256_addsub_pd(ymm18, ymm19); + ymm20 = _mm256_addsub_pd(ymm20, ymm21); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm12 = _mm256_add_pd(ymm12, ymm16); + ymm10 = _mm256_add_pd(ymm10, ymm18); + ymm13 = _mm256_add_pd(ymm13, ymm20); + + _mm256_storeu_pd((double *)tC, ymm8); + _mm256_storeu_pd((double *)(tC + 2), ymm11); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm9); + _mm256_storeu_pd((double *)(tC + 2), ymm12); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm10); + _mm256_storeu_pd((double *)(tC + 2), ymm13); + } + n_remainder = N - col_idx; + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + 
ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + + + BLIS_SET_ALL_YMM_REG_ZEROS + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and multiplies it with the A + // matrix. This loop is processing + // Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + + tptr += (tb_inc_row * 2); + tA += lda; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and multiplies it with the A + // matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied with matrix A columns. + ymm0 = _mm256_loadu_pd((double const*)tA); + ymm1 = _mm256_loadu_pd((double const*) + (tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and multiplies it with the A + // matrix. This loop is processing + // Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied with matrix A columns. 
+ ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + + tptr += (tb_inc_row * 2); + tA += lda; + } + + } + else //handles non-transpose case + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and multiplies it with the A + // matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied with matrix A columns. + ymm0 = _mm256_loadu_pd((double const*)tA); + ymm1 = _mm256_loadu_pd((double const*) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda; + } + + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm12 = _mm256_addsub_pd(ymm12, ymm7); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm12, ymm0); + ymm14 = _mm256_mul_pd(ymm12, ymm14); + ymm12 = _mm256_hsub_pd(ymm15, ymm14); + + + BLIS_SET_YMM_REG_ZEROS + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
+ ymm0 = _mm256_loadu_pd((double const *)tC); + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_loadu_pd((double const *)(tC + 2)); + ymm6 = _mm256_fmadd_pd(ymm0, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + + ymm0 = _mm256_loadu_pd((double const *)(tC + ldc)); + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc + 2)); + ymm16 = _mm256_fmadd_pd(ymm0, ymm2, ymm16); + ymm17 = _mm256_fmadd_pd(ymm0, ymm3, ymm17); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm17 = _mm256_permute_pd(ymm17, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm16 = _mm256_addsub_pd(ymm16, ymm17); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm12 = _mm256_add_pd(ymm12, ymm16); + + _mm256_storeu_pd((double *)(tC + 0), ymm8); + _mm256_storeu_pd((double *)(tC + 2), ymm11); + tC += ldc; + _mm256_storeu_pd((double *)tC, ymm9); + _mm256_storeu_pd((double *)(tC + 2), ymm12); + } + + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + + + BLIS_SET_ALL_YMM_REG_ZEROS + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and multiplies it with the A + // matrix. This loop is processing + // Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tptr += (tb_inc_row * 2); + tA += lda; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + tptr += tb_inc_row*2; + + //broadcasted matrix B elements are + //multiplied with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tA += lda; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. 
+ // This loop is processing Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tptr += (tb_inc_row * 2); + tA += lda; + } + } + else //handles non-transpose case + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + tptr += tb_inc_row*2; + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tA += lda; + } + + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + + BLIS_SET_YMM_REG_ZEROS + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
+ ymm0 = _mm256_loadu_pd((double const *)tC); + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_loadu_pd((double const *)(tC + 2)); + ymm6 = _mm256_fmadd_pd(ymm0, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + + _mm256_storeu_pd((double *)tC, ymm8); + _mm256_storeu_pd((double *)(tC + 2), ymm11); + } + } + m_remainder = M - row_idx; + + if ((m_remainder == 3)) + { + m_remainder -= 3; + __m128d xmm0; + + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + + BLIS_SET_ALL_YMM_REG_ZEROS + + xmm0 = _mm_setzero_pd(); + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing Z_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *)(tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing Z_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. 
+ // This loop is processing Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *)(tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing Z_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda; + } + + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing Z_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. 
+ // This loop is processing Z_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda; + } + + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm14 = _mm256_permute_pd(ymm14, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm12 = _mm256_addsub_pd(ymm12, ymm7); + ymm10 = _mm256_addsub_pd(ymm10, ymm14); + ymm13 = _mm256_addsub_pd(ymm13, ymm15); + // alpha, beta multiplication. + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm10, ymm0); + ymm14 = _mm256_mul_pd(ymm10, ymm14); + ymm10 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm12, ymm0); + ymm14 = _mm256_mul_pd(ymm12, ymm14); + ymm12 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm13, ymm14); + ymm13 = _mm256_hsub_pd(ymm15, ymm14); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + + + BLIS_SET_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
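+                    // With m_remainder == 3, the third (odd) row of each
+                    // C column is handled as a 128-bit half: a single
+                    // dcomplex is loaded with _mm_loadu_pd() into xmm0 and
+                    // merged into the low lane of ymm1; on the store side
+                    // only that low lane is written back via
+                    // _mm256_extractf128_pd(), so memory beyond the three
+                    // rows is never touched.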
+ ymm0 = _mm256_loadu_pd((double const *)tC); + xmm0 = _mm_loadu_pd((double const *)(tC + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm6 = _mm256_fmadd_pd(ymm1, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc)); + xmm0 = _mm_loadu_pd((double const *) + (tC + ldc + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + ymm16 = _mm256_fmadd_pd(ymm1, ymm2, ymm16); + ymm17 = _mm256_fmadd_pd(ymm1, ymm3, ymm17); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc * 2)); + xmm0 = _mm_loadu_pd((double const *) + (tC + ldc * 2 + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm18 = _mm256_fmadd_pd(ymm0, ymm2, ymm18); + ymm19 = _mm256_fmadd_pd(ymm0, ymm3, ymm19); + ymm20 = _mm256_fmadd_pd(ymm1, ymm2, ymm20); + ymm21 = _mm256_fmadd_pd(ymm1, ymm3, ymm21); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm17 = _mm256_permute_pd(ymm17, 0x5); + ymm19 = _mm256_permute_pd(ymm19, 0x5); + ymm21 = _mm256_permute_pd(ymm21, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm16 = _mm256_addsub_pd(ymm16, ymm17); + ymm18 = _mm256_addsub_pd(ymm18, ymm19); + ymm20 = _mm256_addsub_pd(ymm20, ymm21); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm12 = _mm256_add_pd(ymm12, ymm16); + ymm10 = _mm256_add_pd(ymm10, ymm18); + ymm13 = _mm256_add_pd(ymm13, ymm20); + + _mm256_storeu_pd((double *)tC, ymm8); + xmm0 = _mm256_extractf128_pd(ymm11, 0); + _mm_storeu_pd((double *)(tC + 2), xmm0); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm9); + xmm0 = _mm256_extractf128_pd(ymm12, 0); + _mm_storeu_pd((double *)(tC + 2), xmm0); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm10); + xmm0 = _mm256_extractf128_pd(ymm13, 0); + _mm_storeu_pd((double *)(tC + 2), xmm0); + } + n_remainder = N - col_idx; + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + + BLIS_SET_ALL_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd((tptr + + tb_inc_col + * 0)); + ymm3 = _mm256_broadcast_sd((tptr + + tb_inc_col + * 0 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd((tptr + + tb_inc_col + * 0)); + ymm3 = _mm256_broadcast_sd((tptr + + tb_inc_col + * 0 + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd((tptr + + tb_inc_col + * 0)); + ymm3 = _mm256_broadcast_sd((tptr + + tb_inc_col + * 0 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd((tptr + + tb_inc_col + * 0)); + ymm3 = _mm256_broadcast_sd((tptr + + tb_inc_col + * 0 + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
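+ // Accumulation scheme: ymm8/ymm11 collect A*b_real terms
+ // and ymm4/ymm5 collect A*b_imag terms for the first
+ // column of B (ymm9/ymm12 and ymm6/ymm7 play the same
+ // roles for the second). After the k-loop,
+ // _mm256_permute_pd(., 0x5) swaps the re/im slots of the
+ // imag-product registers and _mm256_addsub_pd merges the
+ // two into {ar*br - ai*bi, ai*br + ar*bi} per 128-bit
+ // lane, i.e. the complex product.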
+ ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda; + } + + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm12 = _mm256_addsub_pd(ymm12, ymm7); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm12, ymm0); + ymm14 = _mm256_mul_pd(ymm12, ymm14); + ymm12 = _mm256_hsub_pd(ymm15, ymm14); + + + + BLIS_SET_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
+ ymm0 = _mm256_loadu_pd((double const *)tC); + xmm0 = _mm_loadu_pd((double const *)(tC + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm6 = _mm256_fmadd_pd(ymm1, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm0 = _mm256_loadu_pd((double const *)(tC + ldc)); + xmm0 = _mm_loadu_pd((double const *)(tC + ldc + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + ymm16 = _mm256_fmadd_pd(ymm1, ymm2, ymm16); + ymm17 = _mm256_fmadd_pd(ymm1, ymm3, ymm17); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm17 = _mm256_permute_pd(ymm17, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm16 = _mm256_addsub_pd(ymm16, ymm17); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm12 = _mm256_add_pd(ymm12, ymm16); + + _mm256_storeu_pd((double *)tC, ymm8); + xmm0 = _mm256_extractf128_pd(ymm11, 0); + _mm_storeu_pd((double *)(tC + 2), xmm0); + + tC += ldc; + _mm256_storeu_pd((double *)tC, ymm9); + xmm0 = _mm256_extractf128_pd(ymm12, 0); + _mm_storeu_pd((double *)(tC + 2), xmm0); + } + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + + + BLIS_SET_ALL_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
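+ // Three dcomplex of A are consumed per k step here: one
+ // full ymm load (two elements) plus an xmm load inserted
+ // into the low lane of ymm1; only the low lane of the
+ // ymm1-based results is ever stored back to C.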
+ ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tptr += tb_inc_row*2; + tA += lda; + } + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + + + BLIS_SET_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
+ ymm0 = _mm256_loadu_pd((double const *)tC);
+ xmm0 = _mm_loadu_pd((double const *)(tC + 2));
+ ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0);
+
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4);
+ ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5);
+ ymm6 = _mm256_fmadd_pd(ymm1, ymm2, ymm6);
+ ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7);
+ }
+ ymm5 = _mm256_permute_pd(ymm5, 0x5);
+ ymm7 = _mm256_permute_pd(ymm7, 0x5);
+
+ ymm4 = _mm256_addsub_pd(ymm4, ymm5);
+ ymm6 = _mm256_addsub_pd(ymm6, ymm7);
+
+ ymm8 = _mm256_add_pd(ymm8, ymm4);
+ ymm11 = _mm256_add_pd(ymm11, ymm6);
+
+ _mm256_storeu_pd((double *)tC, ymm8);
+ xmm0 = _mm256_extractf128_pd(ymm11, 0);
+ _mm_storeu_pd((double *)(tC + 2), xmm0);
+ }
+ }
+ if ((m_remainder == 2))
+ {
+ m_remainder -= 2;
+
+ for (col_idx = 0; (col_idx + 2) < N; col_idx += 3)
+ {
+ //pointer math to point to proper memory
+ tC = C + ldc * col_idx + row_idx;
+ tB = B + tb_inc_col * col_idx;
+ tA = A + row_idx;
+
+
+
+ BLIS_SET_ALL_YMM_REG_ZEROS
+ double *tptr = (double *)tB;
+ if(conjtransa && conjtransb)
+ {
+ ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+ ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+ for (k = 0; k < K; ++k)
+ {
+ // The inner loop broadcasts the B matrix
+ // data and
+ // multiplies it with the A matrix.
+ // This loop is processing 2 x K
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ //broadcasted matrix B elements are
+ //multiplied
+ //with matrix A columns.
+ ymm0 = _mm256_loadu_pd((double const *)tA);
+ ymm0 = _mm256_mul_pd(ymm0, ymm20);
+
+
+ ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2 +
+ 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+
+ ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9);
+ ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2 + 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10);
+ ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14);
+
+ tptr += (tb_inc_row * 2);
+ tB += tb_inc_row;
+ tA += lda;
+ }
+ }
+ else if(conjtransa)
+ {
+ ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+ for (k = 0; k < K; ++k)
+ {
+ // The inner loop broadcasts the B matrix
+ // data and
+ // multiplies it with the A matrix.
+ // This loop is processing 2 x K
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+
+ //broadcasted matrix B elements are
+ //multiplied
+ //with matrix A columns.
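+ // m_remainder == 2: a single ymm load covers both
+ // remaining rows (two dcomplex per column of A).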
+ ymm0 = _mm256_loadu_pd((double const *)tA);
+ ymm0 = _mm256_mul_pd(ymm0, ymm20);
+
+
+ ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2 +
+ 1));
+
+
+ ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9);
+ ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2 + 1));
+
+ ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10);
+ ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14);
+
+ tptr += (tb_inc_row * 2);
+ tB += tb_inc_row;
+ tA += lda;
+ }
+ }
+ else if(conjtransb)
+ {
+ ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+ for (k = 0; k < K; ++k)
+ {
+ // The inner loop broadcasts the B matrix
+ // data and
+ // multiplies it with the A matrix.
+ // This loop is processing 2 x K
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ //broadcasted matrix B elements are
+ //multiplied
+ //with matrix A columns.
+ ymm0 = _mm256_loadu_pd((double const *)tA);
+
+
+ ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2 +
+ 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+
+ ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9);
+ ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2 + 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10);
+ ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14);
+
+ tptr += (tb_inc_row * 2);
+ tB += tb_inc_row;
+ tA += lda;
+ }
+ }
+ else
+ {
+ for (k = 0; k < K; ++k)
+ {
+ // The inner loop broadcasts the B matrix
+ // data and
+ // multiplies it with the A matrix.
+ // This loop is processing 2 x K
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+
+ //broadcasted matrix B elements are
+ //multiplied
+ //with matrix A columns.
+ ymm0 = _mm256_loadu_pd((double const *)tA); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda; + } + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm14 = _mm256_permute_pd(ymm14, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm10 = _mm256_addsub_pd(ymm10, ymm14); + // alpha, beta multiplication. + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm10, ymm0); + ymm14 = _mm256_mul_pd(ymm10, ymm14); + ymm10 = _mm256_hsub_pd(ymm15, ymm14); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + + BLIS_SET_YMM_REG_ZEROS + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. + ymm0 = _mm256_loadu_pd((double const *)tC); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_loadu_pd((double const *)(tC + ldc)); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc * 2)); + + ymm18 = _mm256_fmadd_pd(ymm0, ymm2, ymm18); + ymm19 = _mm256_fmadd_pd(ymm0, ymm3, ymm19); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm19 = _mm256_permute_pd(ymm19, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm18 = _mm256_addsub_pd(ymm18, ymm19); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm10 = _mm256_add_pd(ymm10, ymm18); + + _mm256_storeu_pd((double *)tC, ymm8); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm9); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm10); + } + n_remainder = N - col_idx; + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + + BLIS_SET_ALL_YMM_REG_ZEROS + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. 
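+ // tptr walks B in double units while tb_inc_row and
+ // tb_inc_col are dcomplex strides, hence the factor of 2:
+ // tptr + tb_inc_col * 2 is the next column of B and
+ // tptr += tb_inc_row * 2 advances one k step.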
+ ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ ymm0 = _mm256_loadu_pd((double const *)tA); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda; + } + + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + + BLIS_SET_YMM_REG_ZEROS + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. + ymm0 = _mm256_loadu_pd((double const *)tC); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_loadu_pd((double const *)(tC + ldc)); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm9 = _mm256_add_pd(ymm9, ymm14); + + _mm256_storeu_pd((double *)tC, ymm8); + tC += ldc; + _mm256_storeu_pd((double *)tC, ymm9); + } + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + + + BLIS_SET_ALL_YMM_REG_ZEROS + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ ymm0 = _mm256_loadu_pd((double const *)tA);
+ ymm0 = _mm256_mul_pd(ymm0, ymm20);
+
+ ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+ tptr += tb_inc_row*2;
+ tA += lda;
+ }
+ }
+ else if(conjtransb)
+ {
+ ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+ for (k = 0; k < K; ++k)
+ {
+ // The inner loop broadcasts the B matrix
+ // data and
+ // multiplies it with the A matrix.
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ //broadcasted matrix B elements are
+ //multiplied
+ //with matrix A columns.
+ ymm0 = _mm256_loadu_pd((double const *)tA);
+
+ ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+ tptr += tb_inc_row*2;
+ tA += lda;
+ }
+ }
+ else
+ {
+ for (k = 0; k < K; ++k)
+ {
+ // The inner loop broadcasts the B matrix
+ // data and
+ // multiplies it with the A matrix.
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+
+ //broadcasted matrix B elements are
+ //multiplied
+ //with matrix A columns.
+ ymm0 = _mm256_loadu_pd((double const *)tA);
+
+ ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+ tptr += tb_inc_row*2;
+ tA += lda;
+ }
+
+ }
+ ymm4 = _mm256_permute_pd(ymm4, 0x5);
+
+ ymm8 = _mm256_addsub_pd(ymm8, ymm4);
+
+ // alpha, beta multiplication.
+ ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast);
+ ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+
+ ymm14 = _mm256_permute_pd(ymm0, 0x5);
+ ymm14 = _mm256_mul_pd(ymm14, ymm1);
+ ymm15 = _mm256_mul_pd(ymm8, ymm0);
+ ymm14 = _mm256_mul_pd(ymm8, ymm14);
+ ymm8 = _mm256_hsub_pd(ymm15, ymm14);
+
+
+
+ BLIS_SET_YMM_REG_ZEROS
+ ymm2 = _mm256_broadcast_sd((double const *)
+ &beta_cast->real);
+ ymm3 = _mm256_broadcast_sd((double const *)
+ &beta_cast->imag);
+
+ if(is_beta_non_zero)
+ {
+ // multiply C by beta and accumulate col 1.
+ ymm0 = _mm256_loadu_pd((double const *)tC);
+
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4);
+ ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5);
+ }
+ ymm5 = _mm256_permute_pd(ymm5, 0x5);
+
+ ymm4 = _mm256_addsub_pd(ymm4, ymm5);
+
+ ymm8 = _mm256_add_pd(ymm8, ymm4);
+
+ _mm256_storeu_pd((double *)tC, ymm8);
+ }
+ }
+ if ((m_remainder == 1))
+ {
+ m_remainder -= 1;
+ __m128d xmm0;
+
+ for (col_idx = 0; (col_idx + 2) < N; col_idx += 3)
+ {
+ //pointer math to point to proper memory
+ tC = C + ldc * col_idx + row_idx;
+ tB = B + tb_inc_col * col_idx;
+ tA = A + row_idx;
+
+
+
+ BLIS_SET_ALL_YMM_REG_ZEROS
+ xmm0 = _mm_setzero_pd();
+
+ double *tptr = (double *)tB;
+ if(conjtransa && conjtransb)
+ {
+ ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+ ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+ for (k = 0; k < K; ++k)
+ {
+ // The inner loop broadcasts the B matrix
+ // data and
+ // multiplies it with the A matrix.
+ // This loop is processing 1 x K
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ //broadcasted matrix B elements are
+ //multiplied
+ //with matrix A columns.
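+ // m_remainder == 1: the single remaining dcomplex of A is
+ // loaded through xmm into the low lane of ymm0; the high
+ // lane participates in the arithmetic but its result is
+ // never stored.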
+ xmm0 = _mm_loadu_pd((double const *)(tA));
+ ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+ ymm0 = _mm256_mul_pd(ymm0, ymm20);
+
+ ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2 +
+ 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9);
+ ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2 + 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10);
+ ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14);
+
+ tptr += (tb_inc_row * 2);
+ tB += tb_inc_row;
+ tA += lda;
+ }
+ }
+ else if(conjtransa)
+ {
+ ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+ for (k = 0; k < K; ++k)
+ {
+ // The inner loop broadcasts the B matrix
+ // data and
+ // multiplies it with the A matrix.
+ // This loop is processing 1 x K
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+
+ //broadcasted matrix B elements are
+ //multiplied
+ //with matrix A columns.
+ xmm0 = _mm_loadu_pd((double const *)(tA));
+ ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+ ymm0 = _mm256_mul_pd(ymm0, ymm20);
+
+ ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2 +
+ 1));
+
+ ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9);
+ ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2 + 1));
+
+ ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10);
+ ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14);
+
+ tptr += (tb_inc_row * 2);
+ tB += tb_inc_row;
+ tA += lda;
+ }
+ }
+ else if(conjtransb)
+ {
+ ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+ for (k = 0; k < K; ++k)
+ {
+ // The inner loop broadcasts the B matrix
+ // data and
+ // multiplies it with the A matrix.
+ // This loop is processing 1 x K
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ //broadcasted matrix B elements are multiplied
+ //with matrix A columns.
+ xmm0 = _mm_loadu_pd((double const *)(tA));
+ ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+
+ ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2 +
+ 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9);
+ ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2 + 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10);
+ ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14);
+
+ tptr += (tb_inc_row * 2);
+ tB += tb_inc_row;
+ tA += lda;
+ }
+ }
+ else
+ {
+ for (k = 0; k < K; ++k)
+ {
+ // The inner loop broadcasts the B matrix
+ // data and
+ // multiplies it with the A matrix.
+ // This loop is processing 1 x K
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+
+ //broadcasted matrix B elements are
+ //multiplied
+ //with matrix A columns.
+ xmm0 = _mm_loadu_pd((double const *)(tA));
+ ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+
+ ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 2 +
+ 1));
+
+ ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9);
+ ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6);
+
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + (tb_inc_col*2)
+ * 2 + 1));
+
+ ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10);
+ ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14);
+
+ tptr += (tb_inc_row * 2);
+ tB += tb_inc_row;
+ tA += lda;
+ }
+ }
+ ymm4 = _mm256_permute_pd(ymm4, 0x5);
+ ymm6 = _mm256_permute_pd(ymm6, 0x5);
+ ymm14 = _mm256_permute_pd(ymm14, 0x5);
+
+ ymm8 = _mm256_addsub_pd(ymm8, ymm4);
+ ymm9 = _mm256_addsub_pd(ymm9, ymm6);
+ ymm10 = _mm256_addsub_pd(ymm10, ymm14);
+ // alpha, beta multiplication.
+ ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+ ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast);
+
+ ymm14 = _mm256_permute_pd(ymm0, 0x5);
+ ymm14 = _mm256_mul_pd(ymm14, ymm1);
+ ymm15 = _mm256_mul_pd(ymm8, ymm0);
+ ymm14 = _mm256_mul_pd(ymm8, ymm14);
+ ymm8 = _mm256_hsub_pd(ymm15, ymm14);
+
+ ymm14 = _mm256_permute_pd(ymm0, 0x5);
+ ymm14 = _mm256_mul_pd(ymm14, ymm1);
+ ymm15 = _mm256_mul_pd(ymm9, ymm0);
+ ymm14 = _mm256_mul_pd(ymm9, ymm14);
+ ymm9 = _mm256_hsub_pd(ymm15, ymm14);
+
+ ymm14 = _mm256_permute_pd(ymm0, 0x5);
+ ymm14 = _mm256_mul_pd(ymm14, ymm1);
+ ymm15 = _mm256_mul_pd(ymm10, ymm0);
+ ymm14 = _mm256_mul_pd(ymm10, ymm14);
+ ymm10 = _mm256_hsub_pd(ymm15, ymm14);
+
+ ymm2 = _mm256_broadcast_sd((double const *)
+ &beta_cast->real);
+ ymm3 = _mm256_broadcast_sd((double const *)
+ &beta_cast->imag);
+
+ BLIS_SET_YMM_REG_ZEROS
+ xmm0 = _mm_setzero_pd();
+
+ if(is_beta_non_zero)
+ {
+ // multiply C by beta and accumulate col 1.
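+ // With a single row left, C is loaded and stored one
+ // dcomplex (16 bytes) at a time through xmm registers, so
+ // the update never touches memory beyond that row.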
+ xmm0 = _mm_loadu_pd((double const *)(tC)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + xmm0 = _mm_loadu_pd((double const *)(tC + ldc)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + xmm0 = _mm_loadu_pd((double const *)(tC + ldc * 2)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm18 = _mm256_fmadd_pd(ymm0, ymm2, ymm18); + ymm19 = _mm256_fmadd_pd(ymm0, ymm3, ymm19); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm19 = _mm256_permute_pd(ymm19, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm18 = _mm256_addsub_pd(ymm18, ymm19); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm10 = _mm256_add_pd(ymm10, ymm18); + + xmm0 = _mm256_extractf128_pd(ymm8, 0); + _mm_storeu_pd((double *)tC, xmm0); + + tC += ldc; + + xmm0 = _mm256_extractf128_pd(ymm9, 0); + _mm_storeu_pd((double *)tC, xmm0); + + tC += ldc; + xmm0 = _mm256_extractf128_pd(ymm10, 0); + _mm_storeu_pd((double *)tC, xmm0); + } + n_remainder = N - col_idx; + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + + + BLIS_SET_ALL_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda; + } + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + + + BLIS_SET_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
+ xmm0 = _mm_loadu_pd((double const *)(tC)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + xmm0 = _mm_loadu_pd((double const *)(tC + ldc)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm9 = _mm256_add_pd(ymm9, ymm14); + + xmm0 = _mm256_extractf128_pd(ymm8, 0); + _mm_storeu_pd((double *)tC, xmm0); + tC += ldc; + xmm0 = _mm256_extractf128_pd(ymm9, 0); + _mm_storeu_pd((double *)tC, xmm0); + } + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + + BLIS_SET_ALL_YMM_REG_ZEROS + + xmm0 = _mm_setzero_pd(); + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + tptr += tb_inc_row*2; + tA += lda; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + tptr += tb_inc_row*2; + tA += lda; + } + + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. 
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+
+ //broadcasted matrix B elements are
+ //multiplied
+ //with matrix A columns.
+ xmm0 = _mm_loadu_pd((double const *)(tA));
+ ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+
+ ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+ tptr += tb_inc_row*2;
+ tA += lda;
+ }
+
+ }
+ ymm4 = _mm256_permute_pd(ymm4, 0x5);
+
+ ymm8 = _mm256_addsub_pd(ymm8, ymm4);
+
+ // alpha, beta multiplication.
+ ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast);
+ ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+
+ ymm14 = _mm256_permute_pd(ymm0, 0x5);
+ ymm14 = _mm256_mul_pd(ymm14, ymm1);
+ ymm15 = _mm256_mul_pd(ymm8, ymm0);
+ ymm14 = _mm256_mul_pd(ymm8, ymm14);
+ ymm8 = _mm256_hsub_pd(ymm15, ymm14);
+
+
+
+ BLIS_SET_YMM_REG_ZEROS
+ xmm0 = _mm_setzero_pd();
+
+ ymm2 = _mm256_broadcast_sd((double const *)
+ &beta_cast->real);
+ ymm3 = _mm256_broadcast_sd((double const *)
+ &beta_cast->imag);
+
+ if(is_beta_non_zero)
+ {
+ // multiply C by beta and accumulate col 1.
+ xmm0 = _mm_loadu_pd((double const *)(tC));
+ ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+
+ ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4);
+ ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5);
+ }
+ ymm5 = _mm256_permute_pd(ymm5, 0x5);
+
+ ymm4 = _mm256_addsub_pd(ymm4, ymm5);
+
+ ymm8 = _mm256_add_pd(ymm8, ymm4);
+
+ xmm0 = _mm256_extractf128_pd(ymm8, 0);
+ _mm_storeu_pd((double *)tC, xmm0);
+
+ }
+ }
+ // Return the buffer to the pool.
+ if ((required_packing_A == 1) && bli_mem_is_alloc( &local_mem_buf_A_s )) {
+#ifdef BLIS_ENABLE_MEM_TRACING
+ printf( "bli_zgemm_small(): releasing mem pool block\n" );
+#endif
+ bli_membrk_release(&rntm,
+ &local_mem_buf_A_s);
+ }
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
+ return BLIS_SUCCESS;
+ }
+ else
+ {
+ AOCL_DTL_TRACE_EXIT_ERR(
+ AOCL_DTL_LEVEL_INFO,
+ "Invalid dimensions for small gemm."
+ );
+ return BLIS_NONCONFORMAL_DIMENSIONS;
+ }
+};
+
+static err_t bli_zgemm_small_At
+ (
+ obj_t* alpha,
+ obj_t* a,
+ obj_t* b,
+ obj_t* beta,
+ obj_t* c,
+ cntx_t* cntx,
+ cntl_t* cntl
+ )
+{
+ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
+
+ bool conjtransa = bli_obj_has_conj(a);
+ bool conjtransb = bli_obj_has_conj(b);
+
+ gint_t M = bli_obj_length( c ); // number of rows of Matrix C
+ gint_t N = bli_obj_width( c ); // number of columns of Matrix C
+ gint_t K = bli_obj_width_after_trans( a ); // number of columns of OP(A)
+
+
+ if (N < 3) // Implementation assumes that N is at least 3.
+ {
+ AOCL_DTL_TRACE_EXIT_ERR(
+ AOCL_DTL_LEVEL_INFO,
+ "N < 3, cannot be processed by small gemm"
+ );
+ return BLIS_NOT_YET_IMPLEMENTED;
+ }
+
+ if( M && N && K )
+ {
+ guint_t lda = bli_obj_col_stride( a ); // column stride of matrix OP(A)
+ guint_t ldb = bli_obj_col_stride( b ); // column stride of matrix OP(B)
+ guint_t ldc = bli_obj_col_stride( c ); // column stride of matrix C
+ guint_t row_idx, col_idx, k;
+ dcomplex *A = bli_obj_buffer_at_off(a); //pointer to elements of Matrix A
+ dcomplex *B = bli_obj_buffer_at_off(b); //pointer to elements of Matrix B
+ dcomplex *C = bli_obj_buffer_at_off(c); //pointer to elements of Matrix C
+
+ dcomplex *tA = A, *tB = B, *tC = C;//, *tA_pack;
+ dcomplex *tA_packed; // temporary pointer to hold packed A memory pointer
+ guint_t row_idx_packed; //packed A memory row index
+ guint_t lda_packed; //lda of packed A
+ dim_t tb_inc_row = 1; // row stride of matrix B
+ dim_t tb_inc_col = ldb; // column stride of matrix B
+
+ dcomplex *alpha_cast, *beta_cast; // alpha, beta multipliers
+ alpha_cast = bli_obj_buffer_for_1x1(BLIS_DCOMPLEX, alpha);
+ beta_cast = bli_obj_buffer_for_1x1(BLIS_DCOMPLEX, beta);
+
+ gint_t required_packing_A = 1;
+ mem_t local_mem_buf_A_s;
+ dcomplex *D_A_pack = NULL;
+ rntm_t rntm;
+
+ if( bli_obj_has_trans( b ) )
+ {
+ tb_inc_col = 1; // switch row and column strides
+ tb_inc_row = ldb;
+ }
+
+ __m256d ymm4, ymm5, ymm6, ymm7;
+ __m256d ymm8, ymm9, ymm10, ymm11;
+ __m256d ymm12, ymm13, ymm14, ymm15;
+ __m256d ymm16, ymm17, ymm18, ymm19, ymm20, ymm21;
+ __m256d ymm0, ymm1, ymm2, ymm3;
+
+ gint_t n_remainder; // If N is not a multiple of 3 (N % 3).
+ gint_t m_remainder; // If M is not a multiple of Z_MR (M % 4).
+
+ // Check whether beta is zero.
+ // If it is, we perform C = alpha * (A * B)
+ // instead of C = beta * C + alpha * (A * B).
+ bool is_beta_non_zero = 0;
+ if(!bli_obj_equals(beta, &BLIS_ZERO))
+ is_beta_non_zero = 1;
+
+ /*
+ * This function used a global array to pack part of the A input when
+ * needed. However, a global array makes the function non-reentrant,
+ * so we instead allocate a buffer for each invocation.
+ * Since the buffer is too big for the stack and calling malloc on
+ * every invocation would be too expensive, the better approach is to
+ * get the buffer from the pre-allocated pool and return it to the
+ * pool once we are done.
+ *
+ * In order to get the buffer from the pool we need access to the
+ * memory broker. Currently this function is not invoked in a way that
+ * lets it receive the memory broker (via rntm), so the following hack
+ * gets the global memory broker, which can be used to access the pool.
+ *
+ * Note that there will be a memory allocation at least on the first
+ * invocation, as no pool will have been created for this size yet.
+ * Subsequent invocations will just reuse the buffer from the pool.
+ */
+
+ bli_rntm_init_from_global( &rntm );
+ bli_rntm_set_num_threads_only( 1, &rntm );
+ bli_membrk_rntm_set_membrk( &rntm );
+
+ // Get the current size of the buffer pool for A block packing.
+ // We will use the same size to avoid pool re-initialization.
+ siz_t buffer_size = bli_pool_block_size(
+ bli_membrk_pool(bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK),
+ bli_rntm_membrk(&rntm)));
+
+ //
+ // This kernel assumes that "A" will be unpacked if N <= 3.
+ // Usually this range (N <= 3) is handled by SUP; however,
+ // if SUP is disabled, or if for any other reason we do
+ // enter this kernel with N <= 3, we want to make sure that
+ // "A" remains unpacked.
+ //
+ // If this check is removed it will result in a crash as
+ // reported in CPUPL-587.
+ //
+
+ // Z_MR * K dcomplex elements at sizeof(dcomplex) == 16 bytes.
+ if ((N < 3) || (((Z_MR * K) << 4) > buffer_size))
+ {
+ required_packing_A = 0;
+ return BLIS_NOT_YET_IMPLEMENTED;
+ }
+
+ if (required_packing_A == 1)
+ {
+#ifdef BLIS_ENABLE_MEM_TRACING
+ printf( "bli_zgemm_small_At: Requesting mem pool block of size %lu\n",
+ buffer_size);
+#endif
+ // Get the buffer from the pool.
+ bli_membrk_acquire_m(&rntm,
+ buffer_size,
+ BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
+ &local_mem_buf_A_s);
+
+ D_A_pack = bli_mem_buffer(&local_mem_buf_A_s);
+ }
+
+ /*
+ * The computation loop runs over Z_MR x N blocks of the C matrix,
+ * thus accessing Z_MR x K of the A matrix data and K x NR of the
+ * B matrix data.
+ * The computation is organized as inner loops of dimension Z_MR x NR.
+ */
+ // Process Z_MR rows of C matrix at a time.
+ for (row_idx = 0; (row_idx + (Z_MR - 1)) < M; row_idx += Z_MR)
+ {
+
+ tA = A + row_idx * lda;
+ tA_packed = D_A_pack;
+ lda_packed = Z_MR;
+
+ // Pack Z_MR x k of matrix A into the buffer:
+ // contiguous loads from A, strided stores into the
+ // packed buffer (two 2-row passes cover Z_MR = 4 rows).
+ for(inc_t x = 0; x < 2; x += 1)
+ {
+ dcomplex* tA_temp = tA;
+
+ for(k = 0; (k+1) < K; k += 2)
+ {
+ ymm0 = _mm256_loadu_pd((double const *)
+ (tA_temp + 0 * lda));
+ ymm2 = _mm256_loadu_pd((double const *)
+ (tA_temp + 1 * lda));
+
+ ymm6 = _mm256_permute2f128_pd(ymm0,ymm2,0x20);
+ ymm7 = _mm256_permute2f128_pd(ymm0,ymm2,0x31);
+
+ _mm256_storeu_pd((double *)
+ (tA_packed + 0 * lda_packed),
+ ymm6);
+ _mm256_storeu_pd((double *)
+ (tA_packed + 1 * lda_packed),
+ ymm7);
+
+ tA_temp += 2;
+ tA_packed += 2 * lda_packed;
+ }
+
+ for(; k < K; k += 1)
+ {
+ tA_packed[0].real = tA_temp[0 * lda].real;
+ tA_packed[0].imag = tA_temp[0 * lda].imag;
+ tA_packed[1].real = tA_temp[1 * lda].real;
+ tA_packed[1].imag = tA_temp[1 * lda].imag;
+
+ tA_temp += 1;
+ tA_packed += lda_packed;
+ }
+
+ tA += 2 * lda;
+ tA_packed = D_A_pack + (x + 1)*2;
+ }
+
+ tA_packed = D_A_pack;
+ row_idx_packed = 0;
+ lda_packed = Z_MR;
+
+ // Process NR columns of C matrix at a time.
+ for (col_idx = 0; (col_idx + (NR - 1)) < N; col_idx += NR)
+ {
+ //pointer math to point to proper memory
+ tC = C + ldc * col_idx + row_idx;
+ tB = B + tb_inc_col * col_idx;
+ tA = tA_packed + row_idx_packed;
+
+#ifdef BLIS_ENABLE_PREFETCH
+ _mm_prefetch((char*)(tC + 0), _MM_HINT_T0);
+ _mm_prefetch((char*)(tC + 8), _MM_HINT_T0);
+ _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0);
+ _mm_prefetch((char*)(tC + ldc + 8), _MM_HINT_T0);
+ _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0);
+ _mm_prefetch((char*)(tC + 2 * ldc + 8), _MM_HINT_T0);
+#endif
+ // clear scratch registers.
+
+ BLIS_SET_ALL_YMM_REG_ZEROS
+
+ double *tptr = (double *)tB;
+ if(conjtransa && conjtransb)
+ {
+ ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+ ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+ for (k = 0; k < K; ++k)
+ {
+ // The inner loop broadcasts the B matrix
+ // data and
+ // multiplies it with the A matrix.
+ // This loop is processing Z_MR x K
+ ymm2 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0));
+ ymm3 = _mm256_broadcast_sd(
+ (double const *)
+ (tptr + tb_inc_col * 0 +
+ 1));
+ ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+ //broadcasted matrix B elements are
+ //multiplied
+ //with matrix A columns.
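+ // A is now read from the packed buffer: each k step owns
+ // Z_MR (= 4) contiguous dcomplex (lda_packed == Z_MR),
+ // fetched as two full ymm loads with no xmm patching.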
+ ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm14 = _mm256_permute_pd(ymm14, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm12 = _mm256_addsub_pd(ymm12, ymm7); + ymm10 = _mm256_addsub_pd(ymm10, ymm14); + ymm13 = _mm256_addsub_pd(ymm13, ymm15); + + // alpha, beta multiplication. 
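+			// Scaling by a complex alpha uses
+			// (xr + i*xi)*(ar + i*ai)
+			//     = (xr*ar - xi*ai) + i*(xr*ai + xi*ar).
+			// ymm0 holds {ar, ai, ar, ai}; the permute/mul pair
+			// builds {ai, -ar, ai, -ar}; _mm256_hsub_pd of
+			// x*{ar, ai, ..} and x*{ai, -ar, ..} then yields the
+			// {real, imag} pairs of x*alpha in a single register.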
+ ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm10, ymm0); + ymm14 = _mm256_mul_pd(ymm10, ymm14); + ymm10 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm12, ymm0); + ymm14 = _mm256_mul_pd(ymm12, ymm14); + ymm12 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm13, ymm14); + ymm13 = _mm256_hsub_pd(ymm15, ymm14); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + (&beta_cast->imag)); + + + + BLIS_SET_YMM_REG_ZEROS + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. + ymm0 = _mm256_loadu_pd((double const *)tC); + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_loadu_pd((double const *)(tC + 2)); + ymm6 = _mm256_fmadd_pd(ymm0, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + + // col 2 + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc)); + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc + 2)); + ymm16 = _mm256_fmadd_pd(ymm0, ymm2, ymm16); + ymm17 = _mm256_fmadd_pd(ymm0, ymm3, ymm17); + + // col 3 + ymm0 = _mm256_loadu_pd((double const *) + (tC + (ldc * 2))); + ymm18 = _mm256_fmadd_pd(ymm0, ymm2, ymm18); + ymm19 = _mm256_fmadd_pd(ymm0, ymm3, ymm19); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + (ldc * 2) + 2)); + ymm20 = _mm256_fmadd_pd(ymm0, ymm2, ymm20); + ymm21 = _mm256_fmadd_pd(ymm0, ymm3, ymm21); + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm17 = _mm256_permute_pd(ymm17, 0x5); + ymm19 = _mm256_permute_pd(ymm19, 0x5); + ymm21 = _mm256_permute_pd(ymm21, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm16 = _mm256_addsub_pd(ymm16, ymm17); + ymm18 = _mm256_addsub_pd(ymm18, ymm19); + ymm20 = _mm256_addsub_pd(ymm20, ymm21); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm12 = _mm256_add_pd(ymm12, ymm16); + ymm10 = _mm256_add_pd(ymm10, ymm18); + ymm13 = _mm256_add_pd(ymm13, ymm20); + + _mm256_storeu_pd((double *)tC, ymm8); + _mm256_storeu_pd((double *)(tC + 2), ymm11); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm9); + _mm256_storeu_pd((double *)(tC + 2), ymm12); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm10); + _mm256_storeu_pd((double *)(tC + 2), ymm13); + + } + n_remainder = N - col_idx; + + // if the N is not multiple of 3. 
+ // handling edge case. + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + + // clear scratch registers. + + + BLIS_SET_ALL_YMM_REG_ZEROS + double *tptr = (double *)tB; + + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const*)tA); + ymm1 = _mm256_loadu_pd((double const*) + (tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda_packed; + + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const*)tA); + ymm1 = _mm256_loadu_pd((double const*) + (tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
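+					// Only B is conjugated in this path:
+					// A is loaded unchanged below, and the
+					// sign of B's broadcast imaginary part
+					// was already flipped via ymm21.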
+ ymm0 = _mm256_loadu_pd((double const*)tA); + ymm1 = _mm256_loadu_pd((double const*) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const*)tA); + ymm1 = _mm256_loadu_pd((double const*) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + + + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm12 = _mm256_addsub_pd(ymm12, ymm7); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm12, ymm0); + ymm14 = _mm256_mul_pd(ymm12, ymm14); + ymm12 = _mm256_hsub_pd(ymm15, ymm14); + + + + BLIS_SET_YMM_REG_ZEROS + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
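+				// Complex scaling of C by beta mirrors the
+				// alpha step: ymm2/ymm3 hold the broadcast
+				// real and imaginary parts of beta, the two
+				// fmadds per load form C*beta_r and C*beta_i,
+				// and the permute/addsub pair below recombines
+				// them into {real, imag} form before adding
+				// the alpha-scaled accumulators.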
+ ymm0 = _mm256_loadu_pd((double const *)tC); + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_loadu_pd((double const *)(tC + 2)); + ymm6 = _mm256_fmadd_pd(ymm0, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + + ymm0 = _mm256_loadu_pd((double const *)(tC + ldc)); + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc + 2)); + ymm16 = _mm256_fmadd_pd(ymm0, ymm2, ymm16); + ymm17 = _mm256_fmadd_pd(ymm0, ymm3, ymm17); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm17 = _mm256_permute_pd(ymm17, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm16 = _mm256_addsub_pd(ymm16, ymm17); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm12 = _mm256_add_pd(ymm12, ymm16); + + _mm256_storeu_pd((double *)(tC + 0), ymm8); + _mm256_storeu_pd((double *)(tC + 2), ymm11); + tC += ldc; + _mm256_storeu_pd((double *)tC, ymm9); + _mm256_storeu_pd((double *)(tC + 2), ymm12); + } + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + + // clear scratch registers. + BLIS_SET_ALL_YMM_REG_ZEROS + double *tptr = (double *)tB; + + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + tptr += tb_inc_row*2; + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *)(tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tA += lda_packed; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd((double const *)(tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd((double const *)(tptr + tb_inc_col * 0 + 1)); + tptr += tb_inc_row*2; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. 
+ ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tA += lda_packed; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + tptr += tb_inc_row*2; + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tA += lda_packed; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + tptr += tb_inc_row*2; + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm1 = _mm256_loadu_pd((double const *) + (tA + 2)); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tA += lda_packed; + } + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + + + BLIS_SET_YMM_REG_ZEROS + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
+ ymm0 = _mm256_loadu_pd((double const *)tC); + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_loadu_pd((double const *)(tC + 2)); + ymm6 = _mm256_fmadd_pd(ymm0, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + + _mm256_storeu_pd((double *)tC, ymm8); + _mm256_storeu_pd((double *)(tC + 2), ymm11); + } + } + + m_remainder = M - row_idx; + if ((m_remainder == 3)) + { + m_remainder -= 3; + __m128d xmm0; + + tA = A + row_idx * lda; + tA_packed = D_A_pack; + lda_packed = 3; + { + dcomplex* tA_temp = tA; + + for(k = 0; (k+1) < K; k += 2) + { + ymm0 = _mm256_loadu_pd((double const *) + (tA_temp + 0 * lda)); + ymm2 = _mm256_loadu_pd((double const *) + (tA_temp + 1 * lda)); + ymm3 = _mm256_loadu_pd((double const *) + (tA_temp + 2 * lda)); + + ymm6 = _mm256_permute2f128_pd(ymm0,ymm2,0x20); + ymm7 = _mm256_permute2f128_pd(ymm0,ymm2,0x31); + + _mm256_storeu_pd((double *) + (tA_packed + 0 * lda_packed), + ymm6); + xmm0 = _mm256_extractf128_pd(ymm3, 0); + _mm_storeu_pd((double *) + (tA_packed + 0 * lda_packed + 2), + xmm0); + + _mm256_storeu_pd((double *) + (tA_packed + 1 * lda_packed), + ymm7); + xmm0 = _mm256_extractf128_pd(ymm3, 1); + _mm_storeu_pd((double *) + (tA_packed + 1 * lda_packed + 2), + xmm0); + + tA_temp += 2; + tA_packed += 2 * lda_packed; + } + + for(; k < K; k += 1) + { + tA_packed[0].real = tA_temp[0 * lda].real; + tA_packed[0].imag = tA_temp[0 * lda].imag; + tA_packed[1].real = tA_temp[1 * lda].real; + tA_packed[1].imag = tA_temp[1 * lda].imag; + tA_packed[2].real = tA_temp[2 * lda].real; + tA_packed[2].imag = tA_temp[2 * lda].imag; + + tA_temp += 1; + tA_packed += lda_packed; + } + } + + tA_packed = D_A_pack; + row_idx_packed = 0; + lda_packed = 3; + + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + + + BLIS_SET_ALL_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. 
+ // This loop is processing D_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
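+					// Only three rows remain: the first two
+					// dcomplex elements come from a 256-bit
+					// load, the third from a 128-bit load
+					// inserted into the low half of ymm1;
+					// the upper half of ymm1 is never
+					// stored.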
+ ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm13 = _mm256_fmadd_pd(ymm1, ymm2, ymm13); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm14 = _mm256_permute_pd(ymm14, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm12 = _mm256_addsub_pd(ymm12, ymm7); + ymm10 = _mm256_addsub_pd(ymm10, ymm14); + ymm13 = _mm256_addsub_pd(ymm13, ymm15); + // alpha, beta multiplication. + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm10, ymm0); + ymm14 = _mm256_mul_pd(ymm10, ymm14); + ymm10 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm12, ymm0); + ymm14 = _mm256_mul_pd(ymm12, ymm14); + ymm12 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm13, ymm14); + ymm13 = _mm256_hsub_pd(ymm15, ymm14); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + + BLIS_SET_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
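+				// C is likewise read as 2+1 elements (a
+				// 256-bit load plus a 128-bit insert) so that
+				// no more than the three remaining rows are
+				// touched.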
+ ymm0 = _mm256_loadu_pd((double const *)tC); + xmm0 = _mm_loadu_pd((double const *)(tC + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm6 = _mm256_fmadd_pd(ymm1, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc)); + xmm0 = _mm_loadu_pd((double const *) + (tC + ldc + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + ymm16 = _mm256_fmadd_pd(ymm1, ymm2, ymm16); + ymm17 = _mm256_fmadd_pd(ymm1, ymm3, ymm17); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc * 2)); + xmm0 = _mm_loadu_pd((double const *) + (tC + ldc * 2 + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm18 = _mm256_fmadd_pd(ymm0, ymm2, ymm18); + ymm19 = _mm256_fmadd_pd(ymm0, ymm3, ymm19); + ymm20 = _mm256_fmadd_pd(ymm1, ymm2, ymm20); + ymm21 = _mm256_fmadd_pd(ymm1, ymm3, ymm21); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm17 = _mm256_permute_pd(ymm17, 0x5); + ymm19 = _mm256_permute_pd(ymm19, 0x5); + ymm21 = _mm256_permute_pd(ymm21, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm16 = _mm256_addsub_pd(ymm16, ymm17); + ymm18 = _mm256_addsub_pd(ymm18, ymm19); + ymm20 = _mm256_addsub_pd(ymm20, ymm21); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm12 = _mm256_add_pd(ymm12, ymm16); + ymm10 = _mm256_add_pd(ymm10, ymm18); + ymm13 = _mm256_add_pd(ymm13, ymm20); + + _mm256_storeu_pd((double *)tC, ymm8); + xmm0 = _mm256_extractf128_pd(ymm11, 0); + _mm_storeu_pd((double *)(tC + 2), xmm0); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm9); + xmm0 = _mm256_extractf128_pd(ymm12, 0); + _mm_storeu_pd((double *)(tC + 2), xmm0); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm10); + xmm0 = _mm256_extractf128_pd(ymm13, 0); + _mm_storeu_pd((double *)(tC + 2), xmm0); + } + n_remainder = N - col_idx; + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + + // clear scratch registers. + BLIS_SET_ALL_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd((tptr + + tb_inc_col + * 0)); + ymm3 = _mm256_broadcast_sd((tptr + + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd((tptr + + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd((tptr + + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd((tptr + + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd((tptr + + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. 
+ ymm2 = _mm256_broadcast_sd((tptr + + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd((tptr + + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm12 = _mm256_fmadd_pd(ymm1, ymm2, ymm12); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm12 = _mm256_addsub_pd(ymm12, ymm7); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm12, ymm0); + ymm14 = _mm256_mul_pd(ymm12, ymm14); + ymm12 = _mm256_hsub_pd(ymm15, ymm14); + + + BLIS_SET_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
+ ymm0 = _mm256_loadu_pd((double const *)tC); + xmm0 = _mm_loadu_pd((double const *)(tC + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm6 = _mm256_fmadd_pd(ymm1, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc)); + xmm0 = _mm_loadu_pd((double const *) + (tC + ldc + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + ymm16 = _mm256_fmadd_pd(ymm1, ymm2, ymm16); + ymm17 = _mm256_fmadd_pd(ymm1, ymm3, ymm17); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm17 = _mm256_permute_pd(ymm17, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm16 = _mm256_addsub_pd(ymm16, ymm17); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm12 = _mm256_add_pd(ymm12, ymm16); + + _mm256_storeu_pd((double *)tC, ymm8); + xmm0 = _mm256_extractf128_pd(ymm11, 0); + _mm_storeu_pd((double *)(tC + 2), xmm0); + + tC += ldc; + _mm256_storeu_pd((double *)tC, ymm9); + xmm0 = _mm256_extractf128_pd(ymm12, 0); + _mm_storeu_pd((double *)(tC + 2), xmm0); + } + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + + // clear scratch registers. + + BLIS_SET_ALL_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + ymm1 = _mm256_mul_pd(ymm1, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + xmm0 = _mm_loadu_pd((double const *) + (tA + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm11 = _mm256_fmadd_pd(ymm1, ymm2, ymm11); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm5 = _mm256_permute_pd(ymm5, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm11 = _mm256_addsub_pd(ymm11, ymm5); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm11, ymm0); + ymm14 = _mm256_mul_pd(ymm11, ymm14); + ymm11 = _mm256_hsub_pd(ymm15, ymm14); + + + BLIS_SET_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
+ ymm0 = _mm256_loadu_pd((double const *)tC); + xmm0 = _mm_loadu_pd((double const *)(tC + 2)); + ymm1 = _mm256_insertf128_pd(ymm1, xmm0, 0); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm6 = _mm256_fmadd_pd(ymm1, ymm2, ymm6); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm7 = _mm256_permute_pd(ymm7, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm6 = _mm256_addsub_pd(ymm6, ymm7); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm11 = _mm256_add_pd(ymm11, ymm6); + + _mm256_storeu_pd((double *)tC, ymm8); + xmm0 = _mm256_extractf128_pd(ymm11, 0); + _mm_storeu_pd((double *)(tC + 2), xmm0); + } + } + if ((m_remainder == 2)) + { + m_remainder -= 2; + + tA = A + row_idx * lda; + tA_packed = D_A_pack; + lda_packed = 2; + + { + dcomplex* tA_temp = tA; + + for(k = 0; (k+1) < K; k += 2) + { + ymm0 = _mm256_loadu_pd((double const *) + (tA_temp + 0 * lda)); + ymm2 = _mm256_loadu_pd((double const *) + (tA_temp + 1 * lda)); + + ymm6 = _mm256_permute2f128_pd(ymm0,ymm2,0x20); + ymm7 = _mm256_permute2f128_pd(ymm0,ymm2,0x31); + + _mm256_storeu_pd((double *) + (tA_packed + 0 * lda_packed), + ymm6); + _mm256_storeu_pd((double *) + (tA_packed + 1 * lda_packed), + ymm7); + + tA_temp += 2; + tA_packed += 2 * lda_packed; + } + + for(; k < K; k += 1) + { + tA_packed[0].real = tA_temp[0 * lda].real; + tA_packed[0].imag = tA_temp[0 * lda].imag; + tA_packed[1].real = tA_temp[1 * lda].real; + tA_packed[1].imag = tA_temp[1 * lda].imag; + + tA_temp += 1; + tA_packed += lda_packed; + } + } + + tA_packed = D_A_pack; + row_idx_packed = 0; + lda_packed = 2; + + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + + + + BLIS_SET_ALL_YMM_REG_ZEROS + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
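+					// With two rows left a single ymm
+					// register holds the whole A
+					// micro-panel (two dcomplex), so the
+					// ymm1 tail load of the three-row case
+					// is not needed.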
+ ymm0 = _mm256_loadu_pd((double const *)tA); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ ymm0 = _mm256_loadu_pd((double const *)tA); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm14 = _mm256_permute_pd(ymm14, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm10 = _mm256_addsub_pd(ymm10, ymm14); + // alpha, beta multiplication. + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm10, ymm0); + ymm14 = _mm256_mul_pd(ymm10, ymm14); + ymm10 = _mm256_hsub_pd(ymm15, ymm14); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + BLIS_SET_YMM_REG_ZEROS + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
+ ymm0 = _mm256_loadu_pd((double const *)tC); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc)); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + ymm0 = _mm256_loadu_pd((double const *) + (tC + ldc * 2)); + + ymm18 = _mm256_fmadd_pd(ymm0, ymm2, ymm18); + ymm19 = _mm256_fmadd_pd(ymm0, ymm3, ymm19); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm19 = _mm256_permute_pd(ymm19, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm18 = _mm256_addsub_pd(ymm18, ymm19); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm10 = _mm256_add_pd(ymm10, ymm18); + + _mm256_storeu_pd((double *)tC, ymm8); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm9); + + tC += ldc; + + _mm256_storeu_pd((double *)tC, ymm10); + } + n_remainder = N - col_idx; + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + + + // clear scratch registers. + + BLIS_SET_ALL_YMM_REG_ZEROS + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. 
+ ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + ymm0 = _mm256_loadu_pd((double const *)tA); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + BLIS_SET_YMM_REG_ZEROS + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. + ymm0 = _mm256_loadu_pd((double const *)tC); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_loadu_pd((double const *)(tC + ldc)); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm9 = _mm256_add_pd(ymm9, ymm14); + + _mm256_storeu_pd((double *)tC, ymm8); + tC += ldc; + _mm256_storeu_pd((double *)tC, ymm9); + } + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + + // clear scratch registers. 
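+			// n_remainder == 1: a single column of C is left,
+			// so one accumulator pair suffices: ymm8 collects
+			// products with Re(B) and ymm4 products with Im(B)
+			// for the running complex dot product.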
+
+			BLIS_SET_ALL_YMM_REG_ZEROS
+
+			double *tptr = (double *)tB;
+			if(conjtransa && conjtransb)
+			{
+				ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+				ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+				for (k = 0; k < K; ++k)
+				{
+					// The inner loop broadcasts the B matrix
+					// data and
+					// multiplies it with the A matrix.
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0
+							 + 1));
+					ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+					//broadcasted matrix B elements are
+					//multiplied
+					//with matrix A columns.
+					ymm0 = _mm256_loadu_pd((double const *)tA);
+					ymm0 = _mm256_mul_pd(ymm0, ymm20);
+
+					ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+					ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+					tptr += tb_inc_row*2;
+					tA += lda_packed;
+				}
+			}
+			else if(conjtransa)
+			{
+				ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+				for (k = 0; k < K; ++k)
+				{
+					// The inner loop broadcasts the B matrix
+					// data and
+					// multiplies it with the A matrix.
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0
+							 + 1));
+
+					//broadcasted matrix B elements are
+					//multiplied
+					//with matrix A columns.
+					ymm0 = _mm256_loadu_pd((double const *)tA);
+					ymm0 = _mm256_mul_pd(ymm0, ymm20);
+
+					ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+					ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+					tptr += tb_inc_row*2;
+					tA += lda_packed;
+				}
+			}
+			else if(conjtransb)
+			{
+				ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+				for (k = 0; k < K; ++k)
+				{
+					// The inner loop broadcasts the B matrix
+					// data and
+					// multiplies it with the A matrix.
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0
+							 + 1));
+					ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+					//broadcasted matrix B elements are
+					//multiplied
+					//with matrix A columns.
+					ymm0 = _mm256_loadu_pd((double const *)tA);
+
+					ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+					ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+					tptr += tb_inc_row*2;
+					tA += lda_packed;
+				}
+			}
+			else
+			{
+				for (k = 0; k < K; ++k)
+				{
+					// The inner loop broadcasts the B matrix
+					// data and
+					// multiplies it with the A matrix.
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0
+							 + 1));
+
+					//broadcasted matrix B elements are
+					//multiplied
+					//with matrix A columns.
+					ymm0 = _mm256_loadu_pd((double const *)tA);
+
+					ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+					ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+					tptr += tb_inc_row*2;
+					tA += lda_packed;
+				}
+			}
+			ymm4 = _mm256_permute_pd(ymm4, 0x5);
+
+			ymm8 = _mm256_addsub_pd(ymm8, ymm4);
+
+			// alpha, beta multiplication.
+			ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast);
+			ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+
+			ymm14 = _mm256_permute_pd(ymm0, 0x5);
+			ymm14 = _mm256_mul_pd(ymm14, ymm1);
+			ymm15 = _mm256_mul_pd(ymm8, ymm0);
+			ymm14 = _mm256_mul_pd(ymm8, ymm14);
+			ymm8 = _mm256_hsub_pd(ymm15, ymm14);
+
+			BLIS_SET_YMM_REG_ZEROS
+
+			ymm2 = _mm256_broadcast_sd((double const *)
+						   &beta_cast->real);
+			ymm3 = _mm256_broadcast_sd((double const *)
+						   &beta_cast->imag);
+
+			if(is_beta_non_zero)
+			{
+				// multiply C by beta and accumulate col 1.
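+				// When beta is zero this block is skipped and
+				// C is never read, matching the BLAS rule
+				// that beta == 0 must not reference C (so
+				// uninitialized C cannot propagate NaNs).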
+				ymm0 = _mm256_loadu_pd((double const *)tC);
+
+				ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4);
+				ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5);
+			}
+			ymm5 = _mm256_permute_pd(ymm5, 0x5);
+
+			ymm4 = _mm256_addsub_pd(ymm4, ymm5);
+
+			ymm8 = _mm256_add_pd(ymm8, ymm4);
+
+			_mm256_storeu_pd((double *)tC, ymm8);
+		}
+	}
+	if ((m_remainder == 1))
+	{
+		m_remainder -= 1;
+		__m128d xmm0;
+
+		tA = A + row_idx * lda;
+		tA_packed = D_A_pack;
+		lda_packed = 1;
+
+		{
+			dcomplex* tA_temp = tA;
+
+			for(k = 0; (k+1) < K; k += 2)
+			{
+				ymm0 = _mm256_loadu_pd((double const *)
+						       (tA_temp + 0 * lda));
+
+				xmm0 = _mm256_extractf128_pd(ymm0, 0);
+				_mm_storeu_pd((double *)
+					      (tA_packed + 0 * lda_packed),
+					      xmm0);
+
+				xmm0 = _mm256_extractf128_pd(ymm0, 1);
+				_mm_storeu_pd((double *)(tA_packed + 1
+							 * lda_packed), xmm0);
+
+				tA_temp += 2;
+				tA_packed += 2 * lda_packed;
+			}
+
+			for(; k < K; k += 1)
+			{
+				tA_packed[0].real = tA_temp[0 * lda].real;
+				tA_packed[0].imag = tA_temp[0 * lda].imag;
+
+				tA_temp += 1;
+				tA_packed += lda_packed;
+			}
+		}
+
+		tA_packed = D_A_pack;
+		row_idx_packed = 0;
+		lda_packed = 1;
+
+		for (col_idx = 0; (col_idx + 2) < N; col_idx += 3)
+		{
+			//pointer math to point to proper memory
+			tC = C + ldc * col_idx + row_idx;
+			tB = B + tb_inc_col * col_idx;
+			tA = tA_packed + row_idx_packed;
+
+			BLIS_SET_ALL_YMM_REG_ZEROS
+			xmm0 = _mm_setzero_pd();
+
+			double *tptr = (double *)tB;
+			if(conjtransa && conjtransb)
+			{
+				ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+				ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+				for (k = 0; k < K; ++k)
+				{
+					// The inner loop broadcasts the B matrix
+					// data and
+					// multiplies it with the A matrix.
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0
+							 + 1));
+					ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+					//broadcasted matrix B elements are
+					//multiplied
+					//with matrix A columns.
+					xmm0 = _mm_loadu_pd((double const *)(tA));
+					ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+					ymm0 = _mm256_mul_pd(ymm0, ymm20);
+
+					ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+					ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 2));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 2
+							 + 1));
+					ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+					ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9);
+					ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6);
+
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + (tb_inc_col*2)
+							 * 2));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + (tb_inc_col*2)
+							 * 2 + 1));
+					ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+					ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10);
+					ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14);
+
+					tptr += (tb_inc_row * 2);
+					tB += tb_inc_row;
+					tA += lda_packed;
+				}
+			}
+			else if(conjtransa)
+			{
+				ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+				for (k = 0; k < K; ++k)
+				{
+					// The inner loop broadcasts the B matrix
+					// data and
+					// multiplies it with the A matrix.
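+					// conjtransa: the 128-bit load of A
+					// below is multiplied by
+					// ymm20 = {1,-1,1,-1}, which flips
+					// the sign of the imaginary element
+					// and yields conj(A) on the fly.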
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0
+							 + 1));
+
+					//broadcasted matrix B elements are
+					//multiplied
+					//with matrix A columns.
+					xmm0 = _mm_loadu_pd((double const *)(tA));
+					ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+					ymm0 = _mm256_mul_pd(ymm0, ymm20);
+
+					ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+					ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 2));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 2
+							 + 1));
+
+					ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9);
+					ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6);
+
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + (tb_inc_col*2)
+							 * 2));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + (tb_inc_col*2)
+							 * 2 + 1));
+
+					ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10);
+					ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14);
+
+					tptr += (tb_inc_row * 2);
+					tB += tb_inc_row;
+					tA += lda_packed;
+				}
+			}
+			else if(conjtransb)
+			{
+				ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+				for (k = 0; k < K; ++k)
+				{
+					// The inner loop broadcasts the B matrix
+					// data and
+					// multiplies it with the A matrix.
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0
+							 + 1));
+					ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+					//broadcasted matrix B elements are
+					//multiplied
+					//with matrix A columns.
+					xmm0 = _mm_loadu_pd((double const *)(tA));
+					ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+
+					ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+					ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 2));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 2
+							 + 1));
+					ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+					ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9);
+					ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6);
+
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + (tb_inc_col*2)
+							 * 2));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + (tb_inc_col*2)
+							 * 2 + 1));
+					ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+					ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10);
+					ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14);
+
+					tptr += (tb_inc_row * 2);
+					tB += tb_inc_row;
+					tA += lda_packed;
+				}
+			}
+			else
+			{
+				for (k = 0; k < K; ++k)
+				{
+					// The inner loop broadcasts the B matrix
+					// data and
+					// multiplies it with the A matrix.
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0
+							 + 1));
+
+					//broadcasted matrix B elements are
+					//multiplied
+					//with matrix A columns.
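+					// Only one dcomplex remains in this
+					// row block, so A is fetched with a
+					// 128-bit load into the low lane of
+					// ymm0; only the low lane is ever
+					// written back to C.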
+ xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + (tb_inc_col*2) + * 2 + 1)); + + ymm10 = _mm256_fmadd_pd(ymm0, ymm2, ymm10); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + + tptr += (tb_inc_row * 2); + tB += tb_inc_row; + tA += lda_packed; + } + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + ymm14 = _mm256_permute_pd(ymm14, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + ymm10 = _mm256_addsub_pd(ymm10, ymm14); + // alpha, beta multiplication. + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm10, ymm0); + ymm14 = _mm256_mul_pd(ymm10, ymm14); + ymm10 = _mm256_hsub_pd(ymm15, ymm14); + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + BLIS_SET_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. + xmm0 = _mm_loadu_pd((double const *)(tC)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + xmm0 = _mm_loadu_pd((double const *)(tC + ldc)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + xmm0 = _mm_loadu_pd((double const *) + (tC + ldc * 2)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm18 = _mm256_fmadd_pd(ymm0, ymm2, ymm18); + ymm19 = _mm256_fmadd_pd(ymm0, ymm3, ymm19); + + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + ymm19 = _mm256_permute_pd(ymm19, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + ymm18 = _mm256_addsub_pd(ymm18, ymm19); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm9 = _mm256_add_pd(ymm9, ymm14); + ymm10 = _mm256_add_pd(ymm10, ymm18); + + xmm0 = _mm256_extractf128_pd(ymm8, 0); + _mm_storeu_pd((double *)tC, xmm0); + + tC += ldc; + + xmm0 = _mm256_extractf128_pd(ymm9, 0); + _mm_storeu_pd((double *)tC, xmm0); + + tC += ldc; + xmm0 = _mm256_extractf128_pd(ymm10, 0); + _mm_storeu_pd((double *)tC, xmm0); + } + n_remainder = N - col_idx; + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + + // clear scratch registers. 
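+			// tB is walked through a double pointer below: each
+			// dcomplex spans two doubles, hence the factor of 2
+			// in the tb_inc_col offsets and in the per-k advance
+			// tptr += tb_inc_row*2.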
+ + BLIS_SET_ALL_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else if(conjtransb) + { + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else + { + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. 
+ xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 2 + + 1)); + + ymm9 = _mm256_fmadd_pd(ymm0, ymm2, ymm9); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + ymm4 = _mm256_permute_pd(ymm4, 0x5); + ymm6 = _mm256_permute_pd(ymm6, 0x5); + + ymm8 = _mm256_addsub_pd(ymm8, ymm4); + ymm9 = _mm256_addsub_pd(ymm9, ymm6); + + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast); + ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm8, ymm0); + ymm14 = _mm256_mul_pd(ymm8, ymm14); + ymm8 = _mm256_hsub_pd(ymm15, ymm14); + + ymm14 = _mm256_permute_pd(ymm0, 0x5); + ymm14 = _mm256_mul_pd(ymm14, ymm1); + ymm15 = _mm256_mul_pd(ymm9, ymm0); + ymm14 = _mm256_mul_pd(ymm9, ymm14); + ymm9 = _mm256_hsub_pd(ymm15, ymm14); + + + BLIS_SET_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + + ymm2 = _mm256_broadcast_sd((double const *) + &beta_cast->real); + ymm3 = _mm256_broadcast_sd((double const *) + &beta_cast->imag); + + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. + xmm0 = _mm_loadu_pd((double const *)(tC)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + xmm0 = _mm_loadu_pd((double const *)(tC + ldc)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + + ymm14 = _mm256_fmadd_pd(ymm0, ymm2, ymm14); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + } + ymm5 = _mm256_permute_pd(ymm5, 0x5); + ymm15 = _mm256_permute_pd(ymm15, 0x5); + + ymm4 = _mm256_addsub_pd(ymm4, ymm5); + ymm14 = _mm256_addsub_pd(ymm14, ymm15); + + ymm8 = _mm256_add_pd(ymm8, ymm4); + ymm9 = _mm256_add_pd(ymm9, ymm14); + + xmm0 = _mm256_extractf128_pd(ymm8, 0); + _mm_storeu_pd((double *)tC, xmm0); + tC += ldc; + xmm0 = _mm256_extractf128_pd(ymm9, 0); + _mm_storeu_pd((double *)tC, xmm0); + } + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + + // clear scratch registers. + + BLIS_SET_ALL_YMM_REG_ZEROS + xmm0 = _mm_setzero_pd(); + + double *tptr = (double *)tB; + if(conjtransa && conjtransb) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. + ymm2 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0)); + ymm3 = _mm256_broadcast_sd( + (double const *) + (tptr + tb_inc_col * 0 + + 1)); + ymm3 = _mm256_mul_pd(ymm3, ymm21); + + //broadcasted matrix B elements are + //multiplied + //with matrix A columns. + xmm0 = _mm_loadu_pd((double const *)(tA)); + ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0); + ymm0 = _mm256_mul_pd(ymm0, ymm20); + + ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + tptr += tb_inc_row*2; + tA += lda_packed; + } + } + else if(conjtransa) + { + ymm20 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix + // data and + // multiplies it with the A matrix. 
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0
+							 + 1));
+
+					//broadcasted matrix B elements are
+					//multiplied
+					//with matrix A columns.
+					xmm0 = _mm_loadu_pd((double const *)(tA));
+					ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+					ymm0 = _mm256_mul_pd(ymm0, ymm20);
+
+					ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+					ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+					tptr += tb_inc_row*2;
+					tA += lda_packed;
+				}
+			}
+			else if(conjtransb)
+			{
+				ymm21 = _mm256_setr_pd(-1.0, -1.0, -1.0, -1.0);
+				for (k = 0; k < K; ++k)
+				{
+					// The inner loop broadcasts the B matrix
+					// data and
+					// multiplies it with the A matrix.
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0
+							 + 1));
+					ymm3 = _mm256_mul_pd(ymm3, ymm21);
+
+					//broadcasted matrix B elements are
+					//multiplied
+					//with matrix A columns.
+					xmm0 = _mm_loadu_pd((double const *)(tA));
+					ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+
+					ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+					ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+					tptr += tb_inc_row*2;
+					tA += lda_packed;
+				}
+			}
+			else
+			{
+				for (k = 0; k < K; ++k)
+				{
+					// The inner loop broadcasts the B matrix
+					// data and
+					// multiplies it with the A matrix.
+					ymm2 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0));
+					ymm3 = _mm256_broadcast_sd(
+							(double const *)
+							(tptr + tb_inc_col * 0
+							 + 1));
+
+					//broadcasted matrix B elements are
+					//multiplied
+					//with matrix A columns.
+					xmm0 = _mm_loadu_pd((double const *)(tA));
+					ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+
+					ymm8 = _mm256_fmadd_pd(ymm0, ymm2, ymm8);
+					ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+					tptr += tb_inc_row*2;
+					tA += lda_packed;
+				}
+			}
+			ymm4 = _mm256_permute_pd(ymm4, 0x5);
+
+			ymm8 = _mm256_addsub_pd(ymm8, ymm4);
+
+			// alpha, beta multiplication.
+			ymm0 = _mm256_broadcast_pd(( __m128d const*)alpha_cast);
+			ymm1 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+
+			ymm14 = _mm256_permute_pd(ymm0, 0x5);
+			ymm14 = _mm256_mul_pd(ymm14, ymm1);
+			ymm15 = _mm256_mul_pd(ymm8, ymm0);
+			ymm14 = _mm256_mul_pd(ymm8, ymm14);
+			ymm8 = _mm256_hsub_pd(ymm15, ymm14);
+
+			BLIS_SET_YMM_REG_ZEROS
+			xmm0 = _mm_setzero_pd();
+
+			ymm2 = _mm256_broadcast_sd((double const *)
+						   &beta_cast->real);
+			ymm3 = _mm256_broadcast_sd((double const *)
+						   &beta_cast->imag);
+
+			if(is_beta_non_zero)
+			{
+				// multiply C by beta and accumulate col 1.
+				xmm0 = _mm_loadu_pd((double const *)(tC));
+				ymm0 = _mm256_insertf128_pd(ymm0, xmm0, 0);
+
+				ymm4 = _mm256_fmadd_pd(ymm0, ymm2, ymm4);
+				ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5);
+			}
+			ymm5 = _mm256_permute_pd(ymm5, 0x5);
+
+			ymm4 = _mm256_addsub_pd(ymm4, ymm5);
+
+			ymm8 = _mm256_add_pd(ymm8, ymm4);
+
+			xmm0 = _mm256_extractf128_pd(ymm8, 0);
+			_mm_storeu_pd((double *)tC, xmm0);
+
+		}
+	}
+	// Return the buffer to pool
+	if ((required_packing_A == 1) && bli_mem_is_alloc( &local_mem_buf_A_s )){
+#ifdef BLIS_ENABLE_MEM_TRACING
+		printf( "bli_zgemm_small_At(): releasing mem pool block\n" );
+#endif
+		bli_membrk_release(&rntm,
+				   &local_mem_buf_A_s);
+	}
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
+	return BLIS_SUCCESS;
+	}
+	else
+	{
+		AOCL_DTL_TRACE_EXIT_ERR(
+			AOCL_DTL_LEVEL_INFO,
+			"Invalid dimensions for zgemm_small_At."
+			);
+		return BLIS_NONCONFORMAL_DIMENSIONS;
+	}
+};
+#endif