From 9c292b79e2cbd2ee66350f7042a3a9b103e5c649 Mon Sep 17 00:00:00 2001 From: satish kumar nuggu Date: Thu, 29 Sep 2022 16:15:46 +0530 Subject: [PATCH] Fixed ASAN reported issues in [s/c]trsm small kernels Details: 1. Fixed the partial memory access overflows for the variables AlphaVal and ones reported by ASAN. 2. Using 128 bit packed broadcast with the 64 bit data types after type casting would cause garbage data to be filled in the destination register. 3. Fixed this issue by using set_ps instruction instead of broadcast. 4. In cases of n remainder being 1, extra elements were accessed that could cause out-of-bounds memory access. Removed the extra element access. AMD-Internal: [CPUPL-2578][CPUPL-2587] Change-Id: Iaa918060c66287f2f46bcb9f69e9323f6707cf75 --- kernels/zen/3/bli_trsm_small.c | 1791 ++++++++++++++++++++++++++------ 1 file changed, 1473 insertions(+), 318 deletions(-) diff --git a/kernels/zen/3/bli_trsm_small.c b/kernels/zen/3/bli_trsm_small.c index 614acf5d8..023fa4a07 100644 --- a/kernels/zen/3/bli_trsm_small.c +++ b/kernels/zen/3/bli_trsm_small.c @@ -21044,8 +21044,6 @@ BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB xmm0 = _mm_loadu_ps((float *)(a01 + rs_a * 0 + loop_count*6)); _mm_storeu_ps((float *)(ptr_a10_dup + p_lda * 0 + loop_count*6), xmm0); - xmm0 = _mm_loadl_pi(xmm1,(__m64 *)(a01 + rs_a * 0 + 4 + loop_count*6)); - _mm_storel_pi((__m64 *)(ptr_a10_dup + p_lda * 0 + 4 + loop_count*6),xmm0); } } @@ -24872,8 +24870,6 @@ BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB xmm0 = _mm_loadu_ps((float *)(a01 + rs_a * 0 + loop_count*6)); _mm_storeu_ps((float *)(ptr_a10_dup + p_lda * 0 + loop_count*6), xmm0); - xmm0 = _mm_loadl_pi(xmm1,(__m64 *)(a01 + rs_a * 0 + 4 + loop_count*6)); - _mm_storel_pi((__m64 *)(ptr_a10_dup + p_lda * 0 + 4 + loop_count*6),xmm0); } } @@ -41889,7 +41885,7 @@ BLIS_INLINE void ctrsm_small_pack_diag_element ymm16 = _mm256_permute_ps(ymm16, 0x44);\ \ ymm0 = _mm256_loadu_ps((float const *)(b11));\ - ymm3 = _mm256_broadcast_ps((__m128 const 
*)&ones);\ + ymm3 = _mm256_broadcast_ps((__m128 const *)&ones_a);\ ymm3 = _mm256_permute_ps(ymm3, 0x44);\ /*in register transpose * ymm0,ymm1,ymm2 holds @@ -41939,7 +41935,7 @@ BLIS_INLINE void ctrsm_small_pack_diag_element \ ymm0 = _mm256_loadu_ps((float const *)(b11));\ ymm1 = _mm256_loadu_ps((float const *)(b11 + cs_b *1));\ - ymm3 = _mm256_broadcast_ps((__m128 const *)&ones);\ + ymm3 = _mm256_broadcast_ps((__m128 const *)&ones_a);\ ymm3 = _mm256_permute_ps(ymm3, 0x44);\ /*in register transpose * ymm0,ymm1,ymm2 holds @@ -41996,7 +41992,7 @@ BLIS_INLINE void ctrsm_small_pack_diag_element ymm0 = _mm256_loadu_ps((float const *)(b11));\ ymm1 = _mm256_loadu_ps((float const *)(b11 + cs_b *1));\ ymm2 = _mm256_loadu_ps((float const *)(b11 + cs_b *2));\ - ymm3 = _mm256_broadcast_ps((__m128 const *)&ones);\ + ymm3 = _mm256_broadcast_ps((__m128 const *)&ones_a);\ ymm3 = _mm256_permute_ps(ymm3, 0x44);\ /*in register transpose * ymm0,ymm1,ymm2 holds @@ -42059,7 +42055,7 @@ BLIS_INLINE void ctrsm_small_pack_diag_element ymm0 = _mm256_loadu_ps((float const *)(b11));\ ymm1 = _mm256_loadu_ps((float const *)(b11 + cs_b *1));\ ymm2 = _mm256_loadu_ps((float const *)(b11 + cs_b *2));\ - ymm3 = _mm256_broadcast_ps((__m128 const *)&ones);\ + ymm3 = _mm256_broadcast_ps((__m128 const *)&ones_a);\ ymm3 = _mm256_permute_ps(ymm3, 0x44);\ /*in register transpose * ymm0,ymm1,ymm2 holds @@ -42216,7 +42212,6 @@ BLIS_INLINE void ctrsm_small_pack_diag_element _mm256_storeu_ps((float *)(b11 + cs_b * 2 + 4), ymm2);\ } - BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ( obj_t* AlphaObj, @@ -42251,13 +42246,17 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB dim_t i, j, k; //loop variables dim_t k_iter; //number of times GEMM to be performed - scomplex AlphaVal = *(scomplex *)AlphaObj->buffer; //value of alpha + scomplex AlphaVal[2]; + AlphaVal[0] = *(scomplex *)AlphaObj->buffer; //value of alpha + AlphaVal[1] = *(scomplex *)AlphaObj->buffer; //value of alpha + scomplex *L = bli_obj_buffer_at_off(a); 
//pointer to matrix A scomplex *B = bli_obj_buffer_at_off(b); //pointer to matrix B scomplex *a10, *a11, *b01, *b11; //pointers that point to blocks for GEMM and TRSM - scomplex ones = {1.0, 1.0}; + float ones = 1.0; + float ones_a[4] = {1.0, 1.0,1.0,1.0}; bool is_unitdiag = bli_obj_has_unit_diag(a); //scratch registers @@ -42270,14 +42269,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB __m128 xmm0, xmm1, xmm2, xmm3, xmm4; __m128 xmm5; - xmm0 = _mm_setzero_ps(); + xmm0 = _mm_setzero_ps(); xmm1 = _mm_setzero_ps(); xmm2 = _mm_setzero_ps(); xmm3 = _mm_setzero_ps(); xmm4 = _mm_setzero_ps(); xmm5 = _mm_setzero_ps(); - - gint_t required_packing_A = 1; + + gint_t required_packing_A = 1; mem_t local_mem_buf_A_s = {0}; scomplex *D_A_pack = NULL; scomplex d11_pack[d_mr] __attribute__((aligned(64))); @@ -42306,8 +42305,8 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB D_A_pack = bli_mem_buffer(&local_mem_buf_A_s); if(NULL==D_A_pack) return BLIS_NULL_POINTER; } - - /* + + /* Performs solving TRSM for 4 colmns at a time from 0 to m/4 in steps of d_mr a. 
Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4) First there will be no GEMM and no packing of a10 because it is only TRSM @@ -42392,14 +42391,20 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB */ ////extract a00 ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42416,7 +42421,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42431,7 +42439,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42446,7 +42457,10 @@ BLIS_INLINE err_t 
bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4) ); + ymm2 = _mm256_set_ps((a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42461,7 +42475,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5) ); + ymm2 = _mm256_set_ps((a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42476,7 +42493,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42491,7 +42511,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42506,7 +42529,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm15 = 
_mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); + ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm9) @@ -42517,7 +42544,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42532,7 +42562,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42547,7 +42580,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4) ); + ymm2 = _mm256_set_ps((a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42562,7 +42598,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5) ); + ymm2 = _mm256_set_ps((a11 + 
cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42577,7 +42616,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42592,7 +42634,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42608,7 +42653,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -42620,7 +42668,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = 
_mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42635,7 +42686,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4) ); + ymm2 = _mm256_set_ps((a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42650,7 +42704,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5) ); + ymm2 = _mm256_set_ps((a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42665,7 +42722,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42680,7 +42740,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42696,7 +42759,10 @@ BLIS_INLINE err_t 
bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 3)); + ymm1 = _mm256_set_ps((d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm11) @@ -42707,7 +42773,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4) ); + ymm2 = _mm256_set_ps((a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42722,7 +42791,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5) ); + ymm2 = _mm256_set_ps((a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42737,7 +42809,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42752,7 +42827,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + 
ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42768,7 +42846,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 4)); + ymm1 = _mm256_set_ps((d11_pack + 4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm12) @@ -42777,7 +42858,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB #endif a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5) ); + ymm2 = _mm256_set_ps((a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42792,7 +42876,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42807,7 +42894,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + 
cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42823,7 +42913,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 5)); + ymm1 = _mm256_set_ps((d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm13) @@ -42833,7 +42926,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42848,7 +42944,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42864,7 +42963,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 6)); + ymm1 = _mm256_set_ps((d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm14) @@ -42874,7 +42976,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = 
_mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -42890,7 +42995,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 7)); + ymm1 = _mm256_set_ps((d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm15) @@ -42919,8 +43027,8 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB BLIS_CTRSM_SMALL_GEMM_8mx2n(a10,b01,cs_b,p_lda,k_iter) float zero = 0.0; - ymm16 = _mm256_broadcast_ss(&AlphaVal.real); - ymm17 = _mm256_broadcast_ss(&AlphaVal.imag); + ymm16 = _mm256_broadcast_ss(&AlphaVal[0].real); + ymm17 = _mm256_broadcast_ss(&AlphaVal[0].imag); ymm2 = _mm256_broadcast_ss(&zero); ymm3 = _mm256_broadcast_ss(&zero); ymm6 = _mm256_broadcast_ss(&zero); @@ -42971,8 +43079,8 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB BLIS_CTRSM_SMALL_GEMM_8mx1n(a10,b01,cs_b,p_lda,k_iter) float zero = 0.0; - ymm16 = _mm256_broadcast_ss(&AlphaVal.real); - ymm17 = _mm256_broadcast_ss(&AlphaVal.imag); + ymm16 = _mm256_broadcast_ss(&AlphaVal[0].real); + ymm17 = _mm256_broadcast_ss(&AlphaVal[0].imag); ymm2 = _mm256_broadcast_ss(&zero); ymm3 = _mm256_broadcast_ss(&zero); ymm6 = _mm256_broadcast_ss(&zero); @@ -43040,14 +43148,20 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_permute2f128_ps(ymm18,ymm19,0x31); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + 
(d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43064,7 +43178,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43079,7 +43196,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43094,7 +43214,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4) ); + ymm2 = _mm256_set_ps((a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43109,7 
+43232,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5) ); + ymm2 = _mm256_set_ps((a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43124,7 +43250,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43139,7 +43268,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43154,7 +43286,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm9) @@ -43165,7 +43300,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 
+= rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43180,7 +43318,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43195,7 +43336,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4) ); + ymm2 = _mm256_set_ps((a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43210,7 +43354,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5) ); + ymm2 = _mm256_set_ps((a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43225,7 +43372,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + 
cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43240,7 +43390,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43256,7 +43409,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -43268,7 +43424,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43283,7 +43442,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4) ); + ymm2 = _mm256_set_ps((a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real); ymm2 = 
_mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43298,7 +43460,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5) ); + ymm2 = _mm256_set_ps((a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43313,7 +43478,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43328,7 +43496,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43344,7 +43515,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 3)); + ymm1 = _mm256_set_ps((d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm11) @@ -43355,7 +43529,10 @@ BLIS_INLINE err_t 
bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4) ); + ymm2 = _mm256_set_ps((a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real, + (a11 + cs_a*4)->imag,(a11 + cs_a*4)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43370,7 +43547,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5) ); + ymm2 = _mm256_set_ps((a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43385,7 +43565,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43400,7 +43583,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43416,7 +43602,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 4)); + ymm1 = _mm256_set_ps((d11_pack + 
4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm12) @@ -43425,7 +43614,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB #endif a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5) ); + ymm2 = _mm256_set_ps((a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real, + (a11 + cs_a*5)->imag,(a11 + cs_a*5)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43440,7 +43632,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43455,7 +43650,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43471,7 +43669,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 5)); + ymm1 = _mm256_set_ps((d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real); ymm1 
= _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm13) @@ -43481,7 +43682,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6) ); + ymm2 = _mm256_set_ps((a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real, + (a11 + cs_a*6)->imag,(a11 + cs_a*6)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43496,7 +43700,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43512,7 +43719,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 6)); + ymm1 = _mm256_set_ps((d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm14) @@ -43522,7 +43732,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*7) ); + ymm2 = _mm256_set_ps((a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real, + (a11 + cs_a*7)->imag,(a11 + cs_a*7)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43538,7 +43751,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm15 = _mm256_sub_ps(ymm15,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const 
*)(d11_pack + 7)); + ymm1 = _mm256_set_ps((d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm15) @@ -43654,14 +43870,20 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB BLIS_CTRSM_SMALL_NREG_TRANSPOSE_3x4(b11,cs_b,AlphaVal) ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43677,7 +43899,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43691,7 +43916,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + 
(a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43706,7 +43934,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm9) @@ -43717,7 +43948,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43731,7 +43965,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43747,7 +43984,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -43759,7 +43999,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = 
_mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43775,7 +44018,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 3)); + ymm1 = _mm256_set_ps((d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm11) @@ -43803,7 +44049,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ///GEMM code begins/// BLIS_CTRSM_SMALL_GEMM_4mx2n(a10,b01,cs_b,p_lda,k_iter) BLIS_CTRSM_SMALL_NREG_TRANSPOSE_2x4(b11,cs_b,AlphaVal) - } + } else if(1 == n_rem) { ///GEMM code begins/// @@ -43813,14 +44059,20 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real, + (a11 + cs_a*1)->imag,(a11 + cs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43836,7 +44088,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = 
_mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43850,7 +44105,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43865,7 +44123,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm9) @@ -43876,7 +44137,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real, + (a11 + cs_a*2)->imag,(a11 + cs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43890,7 +44154,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + 
cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43906,7 +44173,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -43918,7 +44188,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB a11 += rs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3) ); + ymm2 = _mm256_set_ps((a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real, + (a11 + cs_a*3)->imag,(a11 + cs_a*3)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -43934,7 +44207,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 3)); + ymm1 = _mm256_set_ps((d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm11) @@ -44502,13 +44778,17 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB dim_t i, j, k; //loop variables dim_t k_iter; //number of times GEMM to be performed - scomplex AlphaVal = *(scomplex *)AlphaObj->buffer; //value of alpha + scomplex AlphaVal[2]; + AlphaVal[0] = *(scomplex *)AlphaObj->buffer; //value of alpha + AlphaVal[1] = *(scomplex *)AlphaObj->buffer; //value of alpha + scomplex *L = bli_obj_buffer_at_off(a); //pointer to matrix A scomplex *B = bli_obj_buffer_at_off(b); //pointer to matrix B scomplex *a10, *a11, *b01, *b11; //pointers that point to blocks for GEMM and TRSM - scomplex ones = {1.0, 
1.0}; + float ones = 1.0; + float ones_a[4] = {1.0, 1.0,1.0,1.0}; bool is_unitdiag = bli_obj_has_unit_diag(a); //scratch registers @@ -44521,7 +44801,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB __m128 xmm0, xmm1, xmm2, xmm3, xmm4; __m128 xmm5; - xmm0 = _mm_setzero_ps(); + xmm0 = _mm_setzero_ps(); xmm1 = _mm_setzero_ps(); xmm2 = _mm_setzero_ps(); xmm3 = _mm_setzero_ps(); @@ -44645,14 +44925,24 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB */ ////extract a00 ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 7)); + ymm1 = _mm256_set_ps((d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm15) #else BLIS_CTRSM_MUL(ymm15) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*6 + 7*rs_a)->imag, + (a11 + cs_a*6 + 7*rs_a)->real, + (a11 + cs_a*6 + 7*rs_a)->imag, + (a11 + cs_a*6 + 7*rs_a)->real, + (a11 + cs_a*6 + 7*rs_a)->imag, + (a11 + cs_a*6 + 7*rs_a)->real, + (a11 + cs_a*6 + 7*rs_a)->imag, + (a11 + cs_a*6 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44668,7 +44958,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*5 + 7*rs_a)->imag, + (a11 + cs_a*5 + 7*rs_a)->real, + (a11 + cs_a*5 + 7*rs_a)->imag, + (a11 + cs_a*5 + 7*rs_a)->real, + (a11 + cs_a*5 + 7*rs_a)->imag, + (a11 + cs_a*5 + 7*rs_a)->real, + (a11 + cs_a*5 + 7*rs_a)->imag, + (a11 + cs_a*5 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44683,7 +44980,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = 
_mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*4 + 7*rs_a)->imag, + (a11 + cs_a*4 + 7*rs_a)->real, + (a11 + cs_a*4 + 7*rs_a)->imag, + (a11 + cs_a*4 + 7*rs_a)->real, + (a11 + cs_a*4 + 7*rs_a)->imag, + (a11 + cs_a*4 + 7*rs_a)->real, + (a11 + cs_a*4 + 7*rs_a)->imag, + (a11 + cs_a*4 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44698,7 +45002,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*3 + 7*rs_a)->imag, + (a11 + cs_a*3 + 7*rs_a)->real, + (a11 + cs_a*3 + 7*rs_a)->imag, + (a11 + cs_a*3 + 7*rs_a)->real, + (a11 + cs_a*3 + 7*rs_a)->imag, + (a11 + cs_a*3 + 7*rs_a)->real, + (a11 + cs_a*3 + 7*rs_a)->imag, + (a11 + cs_a*3 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44713,7 +45024,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 7*rs_a)->imag, + (a11 + cs_a*2 + 7*rs_a)->real, + (a11 + cs_a*2 + 7*rs_a)->imag, + (a11 + cs_a*2 + 7*rs_a)->real, + (a11 + cs_a*2 + 7*rs_a)->imag, + (a11 + cs_a*2 + 7*rs_a)->real, + (a11 + cs_a*2 + 7*rs_a)->imag, + (a11 + cs_a*2 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44728,7 +45046,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 7*rs_a)->imag, + (a11 + cs_a*1 + 7*rs_a)->real, + (a11 + cs_a*1 + 7*rs_a)->imag, + (a11 + cs_a*1 + 7*rs_a)->real, + 
(a11 + cs_a*1 + 7*rs_a)->imag, + (a11 + cs_a*1 + 7*rs_a)->real, + (a11 + cs_a*1 + 7*rs_a)->imag, + (a11 + cs_a*1 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44743,7 +45068,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 7*rs_a)->imag, + (a11 + cs_a*0 + 7*rs_a)->real, + (a11 + cs_a*0 + 7*rs_a)->imag, + (a11 + cs_a*0 + 7*rs_a)->real, + (a11 + cs_a*0 + 7*rs_a)->imag, + (a11 + cs_a*0 + 7*rs_a)->real, + (a11 + cs_a*0 + 7*rs_a)->imag, + (a11 + cs_a*0 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44758,7 +45090,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 6)); + ymm1 = _mm256_set_ps((d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm14) @@ -44766,7 +45101,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB BLIS_CTRSM_MUL(ymm14) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5 + 6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*5 + 6*rs_a)->imag, + (a11 + cs_a*5 + 6*rs_a)->real, + (a11 + cs_a*5 + 6*rs_a)->imag, + (a11 + cs_a*5 + 6*rs_a)->real, + (a11 + cs_a*5 + 6*rs_a)->imag, + (a11 + cs_a*5 + 6*rs_a)->real, + (a11 + cs_a*5 + 6*rs_a)->imag, + (a11 + cs_a*5 + 6*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44781,7 +45123,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4 + 
6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*4 + 6*rs_a)->imag, + (a11 + cs_a*4 + 6*rs_a)->real, + (a11 + cs_a*4 + 6*rs_a)->imag, + (a11 + cs_a*4 + 6*rs_a)->real, + (a11 + cs_a*4 + 6*rs_a)->imag, + (a11 + cs_a*4 + 6*rs_a)->real, + (a11 + cs_a*4 + 6*rs_a)->imag, + (a11 + cs_a*4 + 6*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44796,7 +45145,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3 + 6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*3 + 6*rs_a)->imag, + (a11 + cs_a*3 + 6*rs_a)->real, + (a11 + cs_a*3 + 6*rs_a)->imag, + (a11 + cs_a*3 + 6*rs_a)->real, + (a11 + cs_a*3 + 6*rs_a)->imag, + (a11 + cs_a*3 + 6*rs_a)->real, + (a11 + cs_a*3 + 6*rs_a)->imag, + (a11 + cs_a*3 + 6*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44811,7 +45167,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 6*rs_a)->imag, + (a11 + cs_a*2 + 6*rs_a)->real, + (a11 + cs_a*2 + 6*rs_a)->imag, + (a11 + cs_a*2 + 6*rs_a)->real, + (a11 + cs_a*2 + 6*rs_a)->imag, + (a11 + cs_a*2 + 6*rs_a)->real, + (a11 + cs_a*2 + 6*rs_a)->imag, + (a11 + cs_a*2 + 6*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44826,7 +45189,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 6*rs_a)->imag, + (a11 + cs_a*1 + 6*rs_a)->real, + (a11 + cs_a*1 + 6*rs_a)->imag, + (a11 + cs_a*1 + 6*rs_a)->real, + (a11 + cs_a*1 + 6*rs_a)->imag, + (a11 + cs_a*1 + 6*rs_a)->real, + (a11 + cs_a*1 + 6*rs_a)->imag, + (a11 + cs_a*1 + 6*rs_a)->real); ymm2 
= _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44841,7 +45211,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 6*rs_a)->imag, + (a11 + cs_a*0 + 6*rs_a)->real, + (a11 + cs_a*0 + 6*rs_a)->imag, + (a11 + cs_a*0 + 6*rs_a)->real, + (a11 + cs_a*0 + 6*rs_a)->imag, + (a11 + cs_a*0 + 6*rs_a)->real, + (a11 + cs_a*0 + 6*rs_a)->imag, + (a11 + cs_a*0 + 6*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44857,7 +45234,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 5)); + ymm1 = _mm256_set_ps((d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -44867,7 +45247,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4 + 5*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*4 + 5*rs_a)->imag, + (a11 + cs_a*4 + 5*rs_a)->real, + (a11 + cs_a*4 + 5*rs_a)->imag, + (a11 + cs_a*4 + 5*rs_a)->real, + (a11 + cs_a*4 + 5*rs_a)->imag, + (a11 + cs_a*4 + 5*rs_a)->real, + (a11 + cs_a*4 + 5*rs_a)->imag, + (a11 + cs_a*4 + 5*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44882,7 +45269,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3 + 5*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*3 + 5*rs_a)->imag, + (a11 + cs_a*3 + 5*rs_a)->real, + (a11 + cs_a*3 + 5*rs_a)->imag, + (a11 + cs_a*3 + 5*rs_a)->real, + (a11 + cs_a*3 + 5*rs_a)->imag, + (a11 + cs_a*3 + 5*rs_a)->real, + (a11 
+ cs_a*3 + 5*rs_a)->imag, + (a11 + cs_a*3 + 5*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44897,7 +45291,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 5*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 5*rs_a)->imag, + (a11 + cs_a*2 + 5*rs_a)->real, + (a11 + cs_a*2 + 5*rs_a)->imag, + (a11 + cs_a*2 + 5*rs_a)->real, + (a11 + cs_a*2 + 5*rs_a)->imag, + (a11 + cs_a*2 + 5*rs_a)->real, + (a11 + cs_a*2 + 5*rs_a)->imag, + (a11 + cs_a*2 + 5*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44912,7 +45313,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 5*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 5*rs_a)->imag, + (a11 + cs_a*1 + 5*rs_a)->real, + (a11 + cs_a*1 + 5*rs_a)->imag, + (a11 + cs_a*1 + 5*rs_a)->real, + (a11 + cs_a*1 + 5*rs_a)->imag, + (a11 + cs_a*1 + 5*rs_a)->real, + (a11 + cs_a*1 + 5*rs_a)->imag, + (a11 + cs_a*1 + 5*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44927,7 +45335,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 5*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 5*rs_a)->imag, + (a11 + cs_a*0 + 5*rs_a)->real, + (a11 + cs_a*0 + 5*rs_a)->imag, + (a11 + cs_a*0 + 5*rs_a)->real, + (a11 + cs_a*0 + 5*rs_a)->imag, + (a11 + cs_a*0 + 5*rs_a)->real, + (a11 + cs_a*0 + 5*rs_a)->imag, + (a11 + cs_a*0 + 5*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44943,7 +45358,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 4)); + ymm1 = 
_mm256_set_ps((d11_pack + 4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm12) @@ -44952,7 +45370,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3 + 4*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*3 + 4*rs_a)->imag, + (a11 + cs_a*3 + 4*rs_a)->real, + (a11 + cs_a*3 + 4*rs_a)->imag, + (a11 + cs_a*3 + 4*rs_a)->real, + (a11 + cs_a*3 + 4*rs_a)->imag, + (a11 + cs_a*3 + 4*rs_a)->real, + (a11 + cs_a*3 + 4*rs_a)->imag, + (a11 + cs_a*3 + 4*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44967,7 +45392,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 4*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 4*rs_a)->imag, + (a11 + cs_a*2 + 4*rs_a)->real, + (a11 + cs_a*2 + 4*rs_a)->imag, + (a11 + cs_a*2 + 4*rs_a)->real, + (a11 + cs_a*2 + 4*rs_a)->imag, + (a11 + cs_a*2 + 4*rs_a)->real, + (a11 + cs_a*2 + 4*rs_a)->imag, + (a11 + cs_a*2 + 4*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44982,7 +45414,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 4*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 4*rs_a)->imag, + (a11 + cs_a*1 + 4*rs_a)->real, + (a11 + cs_a*1 + 4*rs_a)->imag, + (a11 + cs_a*1 + 4*rs_a)->real, + (a11 + cs_a*1 + 4*rs_a)->imag, + (a11 + cs_a*1 + 4*rs_a)->real, + (a11 + cs_a*1 + 4*rs_a)->imag, + (a11 + cs_a*1 + 4*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -44997,7 +45436,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = 
_mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 4*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 4*rs_a)->imag, + (a11 + cs_a*0 + 4*rs_a)->real, + (a11 + cs_a*0 + 4*rs_a)->imag, + (a11 + cs_a*0 + 4*rs_a)->real, + (a11 + cs_a*0 + 4*rs_a)->imag, + (a11 + cs_a*0 + 4*rs_a)->real, + (a11 + cs_a*0 + 4*rs_a)->imag, + (a11 + cs_a*0 + 4*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45013,7 +45459,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 3)); + ymm1 = _mm256_set_ps((d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm11) @@ -45021,7 +45470,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB BLIS_CTRSM_MUL(ymm11) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45036,7 +45492,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real); ymm2 = 
_mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45051,7 +45514,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45067,7 +45537,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -45076,7 +45549,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a + 2*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45091,7 +45571,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 2*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 
2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45107,7 +45594,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm9) @@ -45116,7 +45606,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a) ); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45132,7 +45629,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -45163,8 +45663,8 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB BLIS_CTRSM_SMALL_GEMM_8mx2n(a10,b01,cs_b,p_lda,k_iter) float zero = 0.0; - ymm16 = _mm256_broadcast_ss(&AlphaVal.real); - ymm17 = _mm256_broadcast_ss(&AlphaVal.imag); + ymm16 = _mm256_broadcast_ss(&AlphaVal[0].real); + ymm17 = _mm256_broadcast_ss(&AlphaVal[0].imag); ymm2 = _mm256_broadcast_ss(&zero); ymm3 = _mm256_broadcast_ss(&zero); ymm6 = _mm256_broadcast_ss(&zero); @@ -45215,8 +45715,8 @@ BLIS_INLINE err_t 
bli_ctrsm_small_AltXB_AuXB BLIS_CTRSM_SMALL_GEMM_8mx1n(a10,b01,cs_b,p_lda,k_iter) float zero = 0.0; - ymm16 = _mm256_broadcast_ss(&AlphaVal.real); - ymm17 = _mm256_broadcast_ss(&AlphaVal.imag); + ymm16 = _mm256_broadcast_ss(&AlphaVal[0].real); + ymm17 = _mm256_broadcast_ss(&AlphaVal[0].imag); ymm2 = _mm256_broadcast_ss(&zero); ymm3 = _mm256_broadcast_ss(&zero); ymm6 = _mm256_broadcast_ss(&zero); @@ -45285,14 +45785,24 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 7)); + ymm1 = _mm256_set_ps((d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real, + (d11_pack + 7)->imag,(d11_pack + 7)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm15) #else BLIS_CTRSM_MUL(ymm15) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*6 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*6 + 7*rs_a)->imag, + (a11 + cs_a*6 + 7*rs_a)->real, + (a11 + cs_a*6 + 7*rs_a)->imag, + (a11 + cs_a*6 + 7*rs_a)->real, + (a11 + cs_a*6 + 7*rs_a)->imag, + (a11 + cs_a*6 + 7*rs_a)->real, + (a11 + cs_a*6 + 7*rs_a)->imag, + (a11 + cs_a*6 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45309,7 +45819,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm14 = _mm256_sub_ps(ymm14,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*5 + 7*rs_a)->imag, + (a11 + cs_a*5 + 7*rs_a)->real, + (a11 + cs_a*5 + 7*rs_a)->imag, + (a11 + cs_a*5 + 7*rs_a)->real, + (a11 + cs_a*5 + 7*rs_a)->imag, + (a11 + cs_a*5 + 7*rs_a)->real, + (a11 + cs_a*5 + 7*rs_a)->imag, + (a11 + cs_a*5 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45324,7 +45841,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = 
_mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*4 + 7*rs_a)->imag, + (a11 + cs_a*4 + 7*rs_a)->real, + (a11 + cs_a*4 + 7*rs_a)->imag, + (a11 + cs_a*4 + 7*rs_a)->real, + (a11 + cs_a*4 + 7*rs_a)->imag, + (a11 + cs_a*4 + 7*rs_a)->real, + (a11 + cs_a*4 + 7*rs_a)->imag, + (a11 + cs_a*4 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45339,7 +45863,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*3 + 7*rs_a)->imag, + (a11 + cs_a*3 + 7*rs_a)->real, + (a11 + cs_a*3 + 7*rs_a)->imag, + (a11 + cs_a*3 + 7*rs_a)->real, + (a11 + cs_a*3 + 7*rs_a)->imag, + (a11 + cs_a*3 + 7*rs_a)->real, + (a11 + cs_a*3 + 7*rs_a)->imag, + (a11 + cs_a*3 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45354,7 +45885,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 7*rs_a)->imag, + (a11 + cs_a*2 + 7*rs_a)->real, + (a11 + cs_a*2 + 7*rs_a)->imag, + (a11 + cs_a*2 + 7*rs_a)->real, + (a11 + cs_a*2 + 7*rs_a)->imag, + (a11 + cs_a*2 + 7*rs_a)->real, + (a11 + cs_a*2 + 7*rs_a)->imag, + (a11 + cs_a*2 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45369,7 +45907,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 7*rs_a)->imag, + (a11 + cs_a*1 + 7*rs_a)->real, + (a11 + cs_a*1 + 7*rs_a)->imag, + (a11 + cs_a*1 + 7*rs_a)->real, + 
(a11 + cs_a*1 + 7*rs_a)->imag, + (a11 + cs_a*1 + 7*rs_a)->real, + (a11 + cs_a*1 + 7*rs_a)->imag, + (a11 + cs_a*1 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45384,7 +45929,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 7*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 7*rs_a)->imag, + (a11 + cs_a*0 + 7*rs_a)->real, + (a11 + cs_a*0 + 7*rs_a)->imag, + (a11 + cs_a*0 + 7*rs_a)->real, + (a11 + cs_a*0 + 7*rs_a)->imag, + (a11 + cs_a*0 + 7*rs_a)->real, + (a11 + cs_a*0 + 7*rs_a)->imag, + (a11 + cs_a*0 + 7*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45399,7 +45951,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 6)); + ymm1 = _mm256_set_ps((d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real, + (d11_pack + 6)->imag,(d11_pack + 6)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm14) @@ -45407,7 +45962,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB BLIS_CTRSM_MUL(ymm14) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*5 + 6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*5 + 6*rs_a)->imag, + (a11 + cs_a*5 + 6*rs_a)->real, + (a11 + cs_a*5 + 6*rs_a)->imag, + (a11 + cs_a*5 + 6*rs_a)->real, + (a11 + cs_a*5 + 6*rs_a)->imag, + (a11 + cs_a*5 + 6*rs_a)->real, + (a11 + cs_a*5 + 6*rs_a)->imag, + (a11 + cs_a*5 + 6*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45422,7 +45984,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4 + 
6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*4 + 6*rs_a)->imag, + (a11 + cs_a*4 + 6*rs_a)->real, + (a11 + cs_a*4 + 6*rs_a)->imag, + (a11 + cs_a*4 + 6*rs_a)->real, + (a11 + cs_a*4 + 6*rs_a)->imag, + (a11 + cs_a*4 + 6*rs_a)->real, + (a11 + cs_a*4 + 6*rs_a)->imag, + (a11 + cs_a*4 + 6*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45437,7 +46006,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3 + 6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*3 + 6*rs_a)->imag, + (a11 + cs_a*3 + 6*rs_a)->real, + (a11 + cs_a*3 + 6*rs_a)->imag, + (a11 + cs_a*3 + 6*rs_a)->real, + (a11 + cs_a*3 + 6*rs_a)->imag, + (a11 + cs_a*3 + 6*rs_a)->real, + (a11 + cs_a*3 + 6*rs_a)->imag, + (a11 + cs_a*3 + 6*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45452,7 +46028,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 6*rs_a)->imag, + (a11 + cs_a*2 + 6*rs_a)->real, + (a11 + cs_a*2 + 6*rs_a)->imag, + (a11 + cs_a*2 + 6*rs_a)->real, + (a11 + cs_a*2 + 6*rs_a)->imag, + (a11 + cs_a*2 + 6*rs_a)->real, + (a11 + cs_a*2 + 6*rs_a)->imag, + (a11 + cs_a*2 + 6*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45467,7 +46050,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 6*rs_a)->imag, + (a11 + cs_a*1 + 6*rs_a)->real, + (a11 + cs_a*1 + 6*rs_a)->imag, + (a11 + cs_a*1 + 6*rs_a)->real, + (a11 + cs_a*1 + 6*rs_a)->imag, + (a11 + cs_a*1 + 6*rs_a)->real, + (a11 + cs_a*1 + 6*rs_a)->imag, + (a11 + cs_a*1 + 6*rs_a)->real); ymm2 
= _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45482,7 +46072,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 6*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 6*rs_a)->imag, + (a11 + cs_a*0 + 6*rs_a)->real, + (a11 + cs_a*0 + 6*rs_a)->imag, + (a11 + cs_a*0 + 6*rs_a)->real, + (a11 + cs_a*0 + 6*rs_a)->imag, + (a11 + cs_a*0 + 6*rs_a)->real, + (a11 + cs_a*0 + 6*rs_a)->imag, + (a11 + cs_a*0 + 6*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45498,7 +46095,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 5)); + ymm1 = _mm256_set_ps((d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real, + (d11_pack + 5)->imag,(d11_pack + 5)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -45508,7 +46108,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*4 + 5*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*4 + 5*rs_a)->imag, + (a11 + cs_a*4 + 5*rs_a)->real, + (a11 + cs_a*4 + 5*rs_a)->imag, + (a11 + cs_a*4 + 5*rs_a)->real, + (a11 + cs_a*4 + 5*rs_a)->imag, + (a11 + cs_a*4 + 5*rs_a)->real, + (a11 + cs_a*4 + 5*rs_a)->imag, + (a11 + cs_a*4 + 5*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45523,7 +46130,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3 + 5*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*3 + 5*rs_a)->imag, + (a11 + cs_a*3 + 5*rs_a)->real, + (a11 + cs_a*3 + 5*rs_a)->imag, + (a11 + cs_a*3 + 5*rs_a)->real, + (a11 + cs_a*3 + 5*rs_a)->imag, + (a11 + cs_a*3 + 5*rs_a)->real, + (a11 
+ cs_a*3 + 5*rs_a)->imag, + (a11 + cs_a*3 + 5*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45538,7 +46152,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 5*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 5*rs_a)->imag, + (a11 + cs_a*2 + 5*rs_a)->real, + (a11 + cs_a*2 + 5*rs_a)->imag, + (a11 + cs_a*2 + 5*rs_a)->real, + (a11 + cs_a*2 + 5*rs_a)->imag, + (a11 + cs_a*2 + 5*rs_a)->real, + (a11 + cs_a*2 + 5*rs_a)->imag, + (a11 + cs_a*2 + 5*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45553,7 +46174,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 5*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 5*rs_a)->imag, + (a11 + cs_a*1 + 5*rs_a)->real, + (a11 + cs_a*1 + 5*rs_a)->imag, + (a11 + cs_a*1 + 5*rs_a)->real, + (a11 + cs_a*1 + 5*rs_a)->imag, + (a11 + cs_a*1 + 5*rs_a)->real, + (a11 + cs_a*1 + 5*rs_a)->imag, + (a11 + cs_a*1 + 5*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45568,7 +46196,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 5*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 5*rs_a)->imag, + (a11 + cs_a*0 + 5*rs_a)->real, + (a11 + cs_a*0 + 5*rs_a)->imag, + (a11 + cs_a*0 + 5*rs_a)->real, + (a11 + cs_a*0 + 5*rs_a)->imag, + (a11 + cs_a*0 + 5*rs_a)->real, + (a11 + cs_a*0 + 5*rs_a)->imag, + (a11 + cs_a*0 + 5*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45584,7 +46219,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 4)); + ymm1 = 
_mm256_set_ps((d11_pack + 4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real, + (d11_pack + 4)->imag,(d11_pack + 4)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm12) @@ -45593,7 +46231,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*3 + 4*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*3 + 4*rs_a)->imag, + (a11 + cs_a*3 + 4*rs_a)->real, + (a11 + cs_a*3 + 4*rs_a)->imag, + (a11 + cs_a*3 + 4*rs_a)->real, + (a11 + cs_a*3 + 4*rs_a)->imag, + (a11 + cs_a*3 + 4*rs_a)->real, + (a11 + cs_a*3 + 4*rs_a)->imag, + (a11 + cs_a*3 + 4*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45608,7 +46253,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 4*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 4*rs_a)->imag, + (a11 + cs_a*2 + 4*rs_a)->real, + (a11 + cs_a*2 + 4*rs_a)->imag, + (a11 + cs_a*2 + 4*rs_a)->real, + (a11 + cs_a*2 + 4*rs_a)->imag, + (a11 + cs_a*2 + 4*rs_a)->real, + (a11 + cs_a*2 + 4*rs_a)->imag, + (a11 + cs_a*2 + 4*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45623,7 +46275,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 4*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 4*rs_a)->imag, + (a11 + cs_a*1 + 4*rs_a)->real, + (a11 + cs_a*1 + 4*rs_a)->imag, + (a11 + cs_a*1 + 4*rs_a)->real, + (a11 + cs_a*1 + 4*rs_a)->imag, + (a11 + cs_a*1 + 4*rs_a)->real, + (a11 + cs_a*1 + 4*rs_a)->imag, + (a11 + cs_a*1 + 4*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45638,7 +46297,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = 
_mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 4*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 4*rs_a)->imag, + (a11 + cs_a*0 + 4*rs_a)->real, + (a11 + cs_a*0 + 4*rs_a)->imag, + (a11 + cs_a*0 + 4*rs_a)->real, + (a11 + cs_a*0 + 4*rs_a)->imag, + (a11 + cs_a*0 + 4*rs_a)->real, + (a11 + cs_a*0 + 4*rs_a)->imag, + (a11 + cs_a*0 + 4*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45654,7 +46320,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 3)); + ymm1 = _mm256_set_ps((d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm11) @@ -45662,7 +46331,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB BLIS_CTRSM_MUL(ymm11) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45677,7 +46353,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real); ymm2 = 
_mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45692,7 +46375,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45708,7 +46398,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -45717,7 +46410,15 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a + 2*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real); + ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45732,7 +46433,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 2*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 
2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45748,7 +46456,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm9) @@ -45757,7 +46468,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a) ); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45773,7 +46491,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -45891,7 +46612,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB BLIS_CTRSM_SMALL_NREG_TRANSPOSE_3x4(b11,cs_b,AlphaVal) ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 3)); + ymm1 = _mm256_set_ps((d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real); ymm1 = 
_mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm11) @@ -45899,7 +46623,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB BLIS_CTRSM_MUL(ymm11) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45913,7 +46644,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45927,7 +46665,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45942,7 +46687,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack 
+ 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -45951,7 +46699,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a + 2*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45965,7 +46720,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 2*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -45980,7 +46742,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm9) @@ -45989,7 +46754,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a) ); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 
1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -46004,7 +46776,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -46040,7 +46815,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB } ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 3)); + ymm1 = _mm256_set_ps((d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real, + (d11_pack + 3)->imag,(d11_pack + 3)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm11) @@ -46048,7 +46826,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB BLIS_CTRSM_MUL(ymm11) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real, + (a11 + cs_a*2 + 3*rs_a)->imag, + (a11 + cs_a*2 + 3*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -46062,7 +46847,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 
+ cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real, + (a11 + cs_a*1 + 3*rs_a)->imag, + (a11 + cs_a*1 + 3*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -46076,7 +46868,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 3*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real, + (a11 + cs_a*0 + 3*rs_a)->imag, + (a11 + cs_a*0 + 3*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -46091,7 +46890,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -46100,7 +46902,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a + 2*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real, + (a11 + cs_a*1 + 2*rs_a)->imag, + (a11 + cs_a*1 + 2*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -46114,7 +46923,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*0 + 2*rs_a)); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 
2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real, + (a11 + cs_a*0 + 2*rs_a)->imag, + (a11 + cs_a*0 + 2*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -46129,7 +46945,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm9) @@ -46138,7 +46957,14 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a) ); + ymm2 = _mm256_set_ps((a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real, + (a11 + cs_a*0 + 1*rs_a)->imag, + (a11 + cs_a*0 + 1*rs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -46153,7 +46979,10 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -46696,7 +47525,6 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB return BLIS_SUCCESS; } - BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ( obj_t* AlphaObj, @@ -46731,7 +47559,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB dim_t i, j, k; //loop variables dim_t k_iter; //number of times GEMM to be performed - 
scomplex AlphaVal = *(scomplex *)AlphaObj->buffer; //value of alpha + scomplex AlphaVal[2]; + AlphaVal[0] = *(scomplex *)AlphaObj->buffer; //value of alpha + AlphaVal[1] = *(scomplex *)AlphaObj->buffer; //value of alpha + scomplex *L = bli_obj_buffer_at_off(a); //pointer to matrix A scomplex *B = bli_obj_buffer_at_off(b); //pointer to matrix B @@ -46750,7 +47581,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB __m128 xmm0, xmm1, xmm2; __m128 xmm5; - xmm0 = _mm_setzero_ps(); + xmm0 = _mm_setzero_ps(); xmm1 = _mm_setzero_ps(); xmm2 = _mm_setzero_ps(); xmm5 = _mm_setzero_ps(); @@ -46870,7 +47701,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB */ ////extract a00 ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_TWO_DIV(ymm12, ymm13) @@ -46878,7 +47712,14 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB BLIS_CTRSM_MUL(ymm12) BLIS_CTRSM_MUL(ymm13) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a *2 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -46902,7 +47743,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + 
cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -46923,7 +47767,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_TWO_DIV(ymm10, ymm11) @@ -46932,7 +47779,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB BLIS_CTRSM_MUL(ymm11) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a) ); + ymm2 = _mm256_set_ps((a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -46952,7 +47802,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -46989,14 +47842,24 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB // Load b11 of size 4x6 and multiply with alpha BLIS_PRE_CTRSM_SMALL_3x4(AlphaVal,b11,cs_b) ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = 
_mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm12) #else BLIS_CTRSM_MUL(ymm12) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a *2 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47014,7 +47877,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47028,7 +47894,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -47036,7 +47905,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB BLIS_CTRSM_MUL(ymm10) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a) ); + ymm2 = _mm256_set_ps((a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47050,7 +47922,10 @@ BLIS_INLINE err_t 
bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -47087,14 +47962,24 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB BLIS_PRE_CTRSM_SMALL_3x3(AlphaVal,b11,cs_b) ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm12) #else BLIS_CTRSM_MUL(ymm12) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a *2 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47111,7 +47996,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47125,7 +48013,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = 
_mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -47133,7 +48024,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB BLIS_CTRSM_MUL(ymm10) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a) ); + ymm2 = _mm256_set_ps((a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47147,7 +48041,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -47195,14 +48092,24 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB BLIS_PRE_CTRSM_SMALL_3x2(AlphaVal,b11,cs_b) ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm12) #else BLIS_CTRSM_MUL(ymm12) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a *2 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a *2 + 
rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47220,7 +48127,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47234,7 +48144,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -47242,7 +48155,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB BLIS_CTRSM_MUL(ymm10) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a) ); + ymm2 = _mm256_set_ps((a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47256,7 +48172,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + 
(d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -47294,14 +48213,24 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB BLIS_PRE_CTRSM_SMALL_3x1(AlphaVal,b11,cs_b) ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm12) #else BLIS_CTRSM_MUL(ymm12) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a *2 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real, + (a11 + cs_a *2 + rs_a*1)->imag, + (a11 + cs_a *2 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47318,7 +48247,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*2) ); + ymm2 = _mm256_set_ps((a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real, + (a11 + cs_a *2)->imag,(a11 + cs_a *2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47332,7 +48264,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + 
(d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -47340,7 +48275,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB BLIS_CTRSM_MUL(ymm10) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a) ); + ymm2 = _mm256_set_ps((a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47354,7 +48292,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -47508,7 +48449,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm11 = _mm256_sub_ps(ymm19, ymm11); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_TWO_DIV(ymm10, ymm11) @@ -47517,7 +48461,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB BLIS_CTRSM_MUL(ymm11) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a) ); + ymm2 = _mm256_set_ps((a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ 
-47537,7 +48484,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm9 = _mm256_sub_ps(ymm9,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -47587,14 +48537,20 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm10 = _mm256_sub_ps(ymm19, ymm10); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) #else BLIS_CTRSM_MUL(ymm10) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47610,7 +48566,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -47663,14 +48622,20 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm10 = _mm256_sub_ps(ymm19, ymm10); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( 
__m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) #else BLIS_CTRSM_MUL(ymm10) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47686,7 +48651,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -47743,14 +48711,20 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm10 = _mm256_sub_ps(ymm19, ymm10); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) #else BLIS_CTRSM_MUL(ymm10) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); 
if(conjtransa) { @@ -47766,7 +48740,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -47817,14 +48794,20 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm10 = _mm256_sub_ps(ymm19, ymm10); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) #else BLIS_CTRSM_MUL(ymm10) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + cs_a*1) ); + ymm2 = _mm256_set_ps((a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real, + (a11 + cs_a)->imag,(a11 + cs_a)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -47840,7 +48823,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm8 = _mm256_sub_ps(ymm8,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -47917,7 +48903,6 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB _mm_storeu_ps((float *)(ptr_a10_dup + p_lda * 0 + x*3), xmm0); xmm0 = _mm_loadl_pi(xmm1,(__m64 *)(a01 + rs_a * 0 + 2 + 
x*3)); _mm_storel_pi((__m64 *)(ptr_a10_dup + p_lda * 0 + 2 + x*3),xmm0); - } } @@ -47963,7 +48948,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_TWO_DIV(ymm8, ymm9) @@ -48003,7 +48991,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -48043,7 +49034,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -48083,7 +49077,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -48120,7 +49117,10 @@ BLIS_INLINE err_t 
bli_ctrsm_small_XAutB_XAlB ymm8 = _mm256_sub_ps(ymm19, ymm8); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -48179,7 +49179,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB dim_t i, j, k; //loop variables dim_t k_iter; //number of times GEMM to be performed - scomplex AlphaVal = *(scomplex *)AlphaObj->buffer; //value of alpha + scomplex AlphaVal[2]; + AlphaVal[0] = *(scomplex *)AlphaObj->buffer; //value of alpha + AlphaVal[1] = *(scomplex *)AlphaObj->buffer; //value of alpha + scomplex *L = bli_obj_buffer_at_off(a); //pointer to matrix A scomplex *B = bli_obj_buffer_at_off(b); //pointer to matrix B @@ -48197,7 +49200,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB __m128 xmm0, xmm1, xmm2; __m128 xmm5; - xmm0 = _mm_setzero_ps(); + xmm0 = _mm_setzero_ps(); xmm1 = _mm_setzero_ps(); xmm2 = _mm_setzero_ps(); xmm5 = _mm_setzero_ps(); @@ -48318,7 +49321,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB */ ////extract a00 ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_TWO_DIV(ymm8, ymm9) @@ -48326,7 +49332,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB BLIS_CTRSM_MUL(ymm8) BLIS_CTRSM_MUL(ymm9) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + 
rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48350,7 +49359,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*2) ); + ymm2 = _mm256_set_ps((a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48372,7 +49384,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_TWO_DIV(ymm10, ymm11) @@ -48384,7 +49399,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB a11 += cs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*2) ); + ymm2 = _mm256_set_ps((a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48405,7 +49423,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm13 = _mm256_sub_ps(ymm13,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ 
-48444,14 +49465,20 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB BLIS_PRE_CTRSM_SMALL_3x4(AlphaVal,b11,cs_b) ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48468,7 +49495,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*2) ); + ymm2 = _mm256_set_ps((a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48484,7 +49514,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -48495,7 +49528,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB a11 += cs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*2) ); + ymm2 = _mm256_set_ps((a11 + rs_a*2)->imag,(a11 
+ rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48511,7 +49547,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -48546,14 +49585,20 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB BLIS_PRE_CTRSM_SMALL_3x3(AlphaVal,b11,cs_b) ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48570,7 +49615,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*2) ); + ymm2 = _mm256_set_ps((a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ 
-48586,7 +49634,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -48597,7 +49648,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB a11 += cs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*2) ); + ymm2 = _mm256_set_ps((a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48613,7 +49667,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -48659,14 +49716,20 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB BLIS_PRE_CTRSM_SMALL_3x2(AlphaVal,b11,cs_b) ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + 
rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48683,7 +49746,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*2) ); + ymm2 = _mm256_set_ps((a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48699,7 +49765,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -48710,7 +49779,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB a11 += cs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*2) ); + ymm2 = _mm256_set_ps((a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48726,7 +49798,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -48765,14 +49840,20 
@@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB BLIS_PRE_CTRSM_SMALL_3x1(AlphaVal,b11,cs_b) ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48789,7 +49870,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*2) ); + ymm2 = _mm256_set_ps((a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48805,7 +49889,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm10) @@ -48816,7 +49903,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB a11 += cs_a; - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*2) ); + ymm2 = _mm256_set_ps((a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + 
(a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real, + (a11 + rs_a*2)->imag,(a11 + rs_a*2)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -48832,7 +49922,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm12 = _mm256_sub_ps(ymm12,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 2)); + ymm1 = _mm256_set_ps((d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real, + (d11_pack + 2)->imag,(d11_pack + 2)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -48983,7 +50076,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_TWO_DIV(ymm8, ymm9) @@ -48991,7 +50087,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB BLIS_CTRSM_MUL(ymm8) BLIS_CTRSM_MUL(ymm9) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -49014,7 +50113,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm11 = _mm256_sub_ps(ymm11,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 
0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -49064,14 +50166,20 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm10 = _mm256_sub_ps(ymm19, ymm10); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -49088,7 +50196,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -49142,14 +50253,20 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm10 = _mm256_sub_ps(ymm19, ymm10); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*1) ); + 
ymm2 = _mm256_set_ps((a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -49166,7 +50283,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -49202,6 +50322,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ///GEMM implementation starts/// BLIS_CTRSM_SMALL_GEMM_2nx2m(a01,b10,cs_b,p_lda,k_iter) ymm16 = _mm256_broadcast_ps(( __m128 const *)(&AlphaVal)); + ymm16 = _mm256_permute_ps(ymm16, 0x44); xmm0 = _mm_loadu_ps((float const *)(b11)); @@ -49224,14 +50345,20 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm10 = _mm256_sub_ps(ymm19, ymm10); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -49248,7 +50375,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm16 = 
_mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -49301,14 +50431,20 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm10 = _mm256_sub_ps(ymm19, ymm10); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) #else BLIS_CTRSM_MUL(ymm8) #endif - ymm2 = _mm256_broadcast_ps((__m128 const *) (a11 + rs_a*1) ); + ymm2 = _mm256_set_ps((a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real, + (a11 + rs_a*1)->imag,(a11 + rs_a*1)->real); ymm2 = _mm256_permute_ps(ymm2, 0x44); if(conjtransa) { @@ -49325,7 +50461,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm16 = _mm256_fmaddsub_ps(ymm1, ymm2, ymm16); ymm10 = _mm256_sub_ps(ymm10,ymm16); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack + 1)); + ymm1 = _mm256_set_ps((d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real, + (d11_pack + 1)->imag,(d11_pack + 1)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION @@ -49445,7 +50584,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = 
_mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_TWO_DIV(ymm8, ymm9) @@ -49485,7 +50627,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -49528,7 +50673,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -49571,7 +50719,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -49611,7 +50762,10 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB ymm8 = _mm256_sub_ps(ymm19, ymm8); ymm18 = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); - ymm1 = _mm256_broadcast_ps(( __m128 const *)(d11_pack)); + ymm1 = _mm256_set_ps((d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real, + 
(d11_pack)->imag,(d11_pack)->real, + (d11_pack)->imag,(d11_pack)->real); ymm1 = _mm256_permute_ps(ymm1, 0x44); #ifndef BLIS_ENABLE_TRSM_PREINVERSION BLIS_CTRSM_DIV(ymm8) @@ -49638,6 +50792,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB return BLIS_SUCCESS; } + /* * Check if the TRSM small path should be taken for this * input and threads combination