From ef0143cc1417e4815e4cafd5a464cc83fe7a1e86 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 23 Aug 2014 14:02:27 -0500 Subject: [PATCH] Renamed _ri, _ri3 packm ukernels to _4m, _3m. Details: - Renamed packm ukernels, _cxk dispatcher, and structure-aware _cxk helper functions to use _4m and _3m instead of _ri and _ri3 suffixes. - Updated names of cpp macros that correspond to packm ukernels. --- frame/1m/packm/bli_packm.h | 4 +- frame/1m/packm/bli_packm_blk_var3.c | 70 +- frame/1m/packm/bli_packm_blk_var4.c | 6 +- ...bli_packm_cxk_ri3.c => bli_packm_cxk_3m.c} | 60 +- ...bli_packm_cxk_ri3.h => bli_packm_cxk_3m.h} | 4 +- ...{bli_packm_cxk_ri.c => bli_packm_cxk_4m.c} | 34 +- ...{bli_packm_cxk_ri.h => bli_packm_cxk_4m.h} | 4 +- frame/1m/packm/bli_packm_gen_cxk.c | 46 +- frame/1m/packm/bli_packm_gen_cxk.h | 4 +- frame/1m/packm/bli_packm_herm_cxk.c | 84 +-- frame/1m/packm/bli_packm_herm_cxk.h | 4 +- frame/1m/packm/bli_packm_tri_cxk.c | 58 +- frame/1m/packm/bli_packm_tri_cxk.h | 4 +- ...m_ref_cxk_ri3.c => bli_packm_ref_cxk_3m.c} | 672 +++++++++--------- ...km_ref_cxk_ri.h => bli_packm_ref_cxk_3m.h} | 16 +- ...km_ref_cxk_ri.c => bli_packm_ref_cxk_4m.c} | 16 +- ...m_ref_cxk_ri3.h => bli_packm_ref_cxk_4m.h} | 16 +- frame/include/bli_kernel_3m_macro_defs.h | 80 +-- frame/include/bli_kernel_4m_macro_defs.h | 80 +-- frame/include/bli_kernel_pre_macro_defs.h | 96 +-- 20 files changed, 679 insertions(+), 679 deletions(-) rename frame/1m/packm/{bli_packm_cxk_ri3.c => bli_packm_cxk_3m.c} (80%) rename frame/1m/packm/{bli_packm_cxk_ri3.h => bli_packm_cxk_3m.h} (96%) rename frame/1m/packm/{bli_packm_cxk_ri.c => bli_packm_cxk_4m.c} (92%) rename frame/1m/packm/{bli_packm_cxk_ri.h => bli_packm_cxk_4m.h} (96%) rename frame/1m/packm/ukernels/{bli_packm_ref_cxk_ri3.c => bli_packm_ref_cxk_3m.c} (71%) rename frame/1m/packm/ukernels/{bli_packm_ref_cxk_ri.h => bli_packm_ref_cxk_3m.h} (85%) rename frame/1m/packm/ukernels/{bli_packm_ref_cxk_ri.c => bli_packm_ref_cxk_4m.c} (99%) rename frame/1m/packm/ukernels/{bli_packm_ref_cxk_ri3.h => bli_packm_ref_cxk_4m.h} (85%) diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index beda85df0..2a512626b 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -50,6 +50,6 @@ #include "bli_packm_tri_cxk.h" #include "bli_packm_cxk.h" -#include "bli_packm_cxk_ri.h" -#include "bli_packm_cxk_ri3.h" +#include "bli_packm_cxk_4m.h" +#include "bli_packm_cxk_3m.h" diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index f79b5a48e..996eba194 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -371,19 +371,19 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \ - diagoffp_i, \ - diagc, \ - uploc, \ - conjc, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p ); \ + PASTEMAC(ch,packm_tri_cxk_3m)( strucc, \ + diagoffp_i, \ + diagc, \ + uploc, \ + conjc, \ + invdiag, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_use, rs_c, cs_c, \ + p_use, rs_p, cs_p ); \ } \ \ \ @@ -411,17 +411,17 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \ - diagoffc_i, \ - uploc, \ - conjc, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p ); \ + PASTEMAC(ch,packm_herm_cxk_3m)( strucc, \ + diagoffc_i, \ + uploc, \ + conjc, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ } \ \ /* NOTE: This value is equivalent to (ps_p*3)/2. */ \ @@ -438,17 +438,17 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \ - 0, \ - BLIS_DENSE, \ - conjc, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p ); \ + PASTEMAC(ch,packm_gen_cxk_3m)( BLIS_GENERAL, \ + 0, \ + BLIS_DENSE, \ + conjc, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ } \ \ /* NOTE: This value is equivalent to (ps_p*3)/2. */ \ diff --git a/frame/1m/packm/bli_packm_blk_var4.c b/frame/1m/packm/bli_packm_blk_var4.c index adcc92088..99de882d8 100644 --- a/frame/1m/packm/bli_packm_blk_var4.c +++ b/frame/1m/packm/bli_packm_blk_var4.c @@ -371,7 +371,7 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \ + PASTEMAC(ch,packm_tri_cxk_4m)( strucc, \ diagoffp_i, \ diagc, \ uploc, \ @@ -418,7 +418,7 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \ + PASTEMAC(ch,packm_herm_cxk_4m)( strucc, \ diagoffc_i, \ uploc, \ conjc, \ @@ -445,7 +445,7 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \ + PASTEMAC(ch,packm_gen_cxk_4m)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ conjc, \ diff --git a/frame/1m/packm/bli_packm_cxk_ri3.c b/frame/1m/packm/bli_packm_cxk_3m.c similarity index 80% rename from frame/1m/packm/bli_packm_cxk_ri3.c rename to frame/1m/packm/bli_packm_cxk_3m.c index dfb6d4525..a4fa6417c 100644 --- a/frame/1m/packm/bli_packm_cxk_ri3.c +++ b/frame/1m/packm/bli_packm_cxk_3m.c @@ -60,9 +60,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 2 */ { NULL, - BLIS_CPACKM_2XK_RI3_KERNEL, + BLIS_CPACKM_2XK_3M_KERNEL, NULL, - BLIS_ZPACKM_2XK_RI3_KERNEL, + BLIS_ZPACKM_2XK_3M_KERNEL, }, /* panel width = 3 */ { @@ -71,9 +71,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 4 */ { NULL, - BLIS_CPACKM_4XK_RI3_KERNEL, + BLIS_CPACKM_4XK_3M_KERNEL, NULL, - BLIS_ZPACKM_4XK_RI3_KERNEL, + BLIS_ZPACKM_4XK_3M_KERNEL, }, /* panel width = 5 */ { @@ -82,9 +82,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 6 */ { NULL, - BLIS_CPACKM_6XK_RI3_KERNEL, + BLIS_CPACKM_6XK_3M_KERNEL, NULL, - BLIS_ZPACKM_6XK_RI3_KERNEL, + BLIS_ZPACKM_6XK_3M_KERNEL, }, /* panel width = 7 */ { @@ -93,9 +93,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 8 */ { NULL, - BLIS_CPACKM_8XK_RI3_KERNEL, + BLIS_CPACKM_8XK_3M_KERNEL, NULL, - BLIS_ZPACKM_8XK_RI3_KERNEL, + BLIS_ZPACKM_8XK_3M_KERNEL, }, /* panel width = 9 */ { @@ -104,9 +104,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 10 */ { NULL, - BLIS_CPACKM_10XK_RI3_KERNEL, + BLIS_CPACKM_10XK_3M_KERNEL, NULL, - BLIS_ZPACKM_10XK_RI3_KERNEL, + BLIS_ZPACKM_10XK_3M_KERNEL, }, /* panel width = 11 */ { @@ -115,9 +115,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 12 */ { NULL, - BLIS_CPACKM_12XK_RI3_KERNEL, + BLIS_CPACKM_12XK_3M_KERNEL, NULL, - BLIS_ZPACKM_12XK_RI3_KERNEL, + BLIS_ZPACKM_12XK_3M_KERNEL, }, /* panel width = 13 */ { @@ -126,9 +126,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 14 */ { NULL, - BLIS_CPACKM_14XK_RI3_KERNEL, + BLIS_CPACKM_14XK_3M_KERNEL, NULL, - BLIS_ZPACKM_14XK_RI3_KERNEL, + BLIS_ZPACKM_14XK_3M_KERNEL, }, /* panel width = 15 */ { @@ -137,9 +137,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 16 */ { NULL, - BLIS_CPACKM_16XK_RI3_KERNEL, + BLIS_CPACKM_16XK_3M_KERNEL, NULL, - BLIS_ZPACKM_16XK_RI3_KERNEL, + BLIS_ZPACKM_16XK_3M_KERNEL, }, /* panel width = 17 */ { @@ -197,7 +197,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict a_i = ( ctype_r* )a + 1; \ ctype_r* restrict p_r = ( ctype_r* )p; \ ctype_r* restrict p_i = ( ctype_r* )p + psp; \ - ctype_r* restrict p_ri = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict p_rpi = ( ctype_r* )p + 2*psp; \ dim_t inca2 = 2*inca; \ dim_t lda2 = 2*lda; \ \ @@ -212,11 +212,11 @@ void PASTEMAC(ch,varname)( \ { \ for ( i = 0; i < m; ++i ) \ { \ - ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ - ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ - ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp; \ - ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp; \ - ctype_r* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp; \ + ctype_r* restrict pi11_rpi = p_rpi + (i )*1 + (j )*ldp; \ \ PASTEMAC(ch,scal2jri3s)( *kappa_r, \ *kappa_i, \ @@ -224,7 +224,7 @@ void PASTEMAC(ch,varname)( \ *alpha11_i, \ *pi11_r, \ *pi11_i, \ - *pi11_ri ); \ + *pi11_rpi ); \ } \ } \ } \ @@ -234,11 +234,11 @@ void PASTEMAC(ch,varname)( \ { \ for ( i = 0; i < m; ++i ) \ { \ - ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ - ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ - ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp; \ - ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp; \ - ctype_r* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp; \ + ctype_r* restrict pi11_rpi = p_rpi + (i )*1 + (j )*ldp; \ \ PASTEMAC(ch,scal2ri3s)( *kappa_r, \ *kappa_i, \ @@ -246,12 +246,12 @@ void PASTEMAC(ch,varname)( \ *alpha11_i, \ *pi11_r, \ *pi11_i, \ - *pi11_ri ); \ + *pi11_rpi ); \ } \ } \ } \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_cxk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_cxk_3m ) diff --git a/frame/1m/packm/bli_packm_cxk_ri3.h b/frame/1m/packm/bli_packm_cxk_3m.h similarity index 96% rename from frame/1m/packm/bli_packm_cxk_ri3.h rename to frame/1m/packm/bli_packm_cxk_3m.h index 0c3d95a10..4335bebf3 100644 --- a/frame/1m/packm/bli_packm_cxk_ri3.h +++ b/frame/1m/packm/bli_packm_cxk_3m.h @@ -32,7 +32,7 @@ */ -#include "bli_packm_ref_cxk_ri3.h" +#include "bli_packm_ref_cxk_3m.h" #undef GENTPROTCO @@ -47,5 +47,5 @@ void PASTEMAC(ch,varname)( \ void* p, inc_t psp, inc_t ldp \ ); -INSERT_GENTPROTCO_BASIC( packm_cxk_ri3 ) +INSERT_GENTPROTCO_BASIC( packm_cxk_3m ) diff --git a/frame/1m/packm/bli_packm_cxk_ri.c b/frame/1m/packm/bli_packm_cxk_4m.c similarity index 92% rename from frame/1m/packm/bli_packm_cxk_ri.c rename to frame/1m/packm/bli_packm_cxk_4m.c index 4a6dc6a63..c07081d77 100644 --- a/frame/1m/packm/bli_packm_cxk_ri.c +++ b/frame/1m/packm/bli_packm_cxk_4m.c @@ -60,9 +60,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 2 */ { NULL, - BLIS_CPACKM_2XK_RI_KERNEL, + BLIS_CPACKM_2XK_4M_KERNEL, NULL, - BLIS_ZPACKM_2XK_RI_KERNEL, + BLIS_ZPACKM_2XK_4M_KERNEL, }, /* panel width = 3 */ { @@ -71,9 +71,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 4 */ { NULL, - BLIS_CPACKM_4XK_RI_KERNEL, + BLIS_CPACKM_4XK_4M_KERNEL, NULL, - BLIS_ZPACKM_4XK_RI_KERNEL, + BLIS_ZPACKM_4XK_4M_KERNEL, }, /* panel width = 5 */ { @@ -82,9 +82,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 6 */ { NULL, - BLIS_CPACKM_6XK_RI_KERNEL, + BLIS_CPACKM_6XK_4M_KERNEL, NULL, - BLIS_ZPACKM_6XK_RI_KERNEL, + BLIS_ZPACKM_6XK_4M_KERNEL, }, /* panel width = 7 */ { @@ -93,9 +93,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 8 */ { NULL, - BLIS_CPACKM_8XK_RI_KERNEL, + BLIS_CPACKM_8XK_4M_KERNEL, NULL, - BLIS_ZPACKM_8XK_RI_KERNEL, + BLIS_ZPACKM_8XK_4M_KERNEL, }, /* panel width = 9 */ { @@ -104,9 +104,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 10 */ { NULL, - BLIS_CPACKM_10XK_RI_KERNEL, + BLIS_CPACKM_10XK_4M_KERNEL, NULL, - BLIS_ZPACKM_10XK_RI_KERNEL, + BLIS_ZPACKM_10XK_4M_KERNEL, }, /* panel width = 11 */ { @@ -115,9 +115,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 12 */ { NULL, - BLIS_CPACKM_12XK_RI_KERNEL, + BLIS_CPACKM_12XK_4M_KERNEL, NULL, - BLIS_ZPACKM_12XK_RI_KERNEL, + BLIS_ZPACKM_12XK_4M_KERNEL, }, /* panel width = 13 */ { @@ -126,9 +126,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 14 */ { NULL, - BLIS_CPACKM_14XK_RI_KERNEL, + BLIS_CPACKM_14XK_4M_KERNEL, NULL, - BLIS_ZPACKM_14XK_RI_KERNEL, + BLIS_ZPACKM_14XK_4M_KERNEL, }, /* panel width = 15 */ { @@ -137,9 +137,9 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = /* panel width = 16 */ { NULL, - BLIS_CPACKM_16XK_RI_KERNEL, + BLIS_CPACKM_16XK_4M_KERNEL, NULL, - BLIS_ZPACKM_16XK_RI_KERNEL, + BLIS_ZPACKM_16XK_4M_KERNEL, }, /* panel width = 17 */ { @@ -249,5 +249,5 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_cxk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_cxk_4m ) diff --git a/frame/1m/packm/bli_packm_cxk_ri.h b/frame/1m/packm/bli_packm_cxk_4m.h similarity index 96% rename from frame/1m/packm/bli_packm_cxk_ri.h rename to frame/1m/packm/bli_packm_cxk_4m.h index ef653d2ec..40c0384dd 100644 --- a/frame/1m/packm/bli_packm_cxk_ri.h +++ b/frame/1m/packm/bli_packm_cxk_4m.h @@ -32,7 +32,7 @@ */ -#include "bli_packm_ref_cxk_ri.h" +#include "bli_packm_ref_cxk_4m.h" #undef GENTPROTCO @@ -47,5 +47,5 @@ void PASTEMAC(ch,varname)( \ void* p, inc_t psp, inc_t ldp \ ); -INSERT_GENTPROTCO_BASIC( packm_cxk_ri ) +INSERT_GENTPROTCO_BASIC( packm_cxk_4m ) diff --git a/frame/1m/packm/bli_packm_gen_cxk.c b/frame/1m/packm/bli_packm_gen_cxk.c index 198946e1a..022d864cf 100644 --- a/frame/1m/packm/bli_packm_gen_cxk.c +++ b/frame/1m/packm/bli_packm_gen_cxk.c @@ -192,7 +192,7 @@ void PASTEMAC(ch,varname)( \ \ \ /* Pack the panel. */ \ - PASTEMAC(ch,packm_cxk_ri)( conjc, \ + PASTEMAC(ch,packm_cxk_4m)( conjc, \ panel_dim, \ panel_len, \ kappa, \ @@ -256,7 +256,7 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_4m ) @@ -317,12 +317,12 @@ void PASTEMAC(ch,varname)( \ \ \ /* Pack the panel. */ \ - PASTEMAC(ch,packm_cxk_ri3)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, psp, ldp ); \ + PASTEMAC(ch,packm_cxk_3m)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, psp, ldp ); \ \ \ /* The packed memory region was acquired/allocated with "aligned" @@ -334,12 +334,12 @@ void PASTEMAC(ch,varname)( \ different register blockings for the edge cases. */ \ if ( m_panel != m_panel_max ) \ { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ - ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (i )*rs_p; \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ + ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (i )*rs_p; \ \ PASTEMAC(chr,setm)( 0, \ BLIS_NONUNIT_DIAG, \ @@ -361,17 +361,17 @@ void PASTEMAC(ch,varname)( \ m_edge, \ n_edge, \ zero_r, \ - p_edge_ri, rs_p, cs_p ); \ + p_edge_rpi, rs_p, cs_p ); \ } \ \ if ( n_panel != n_panel_max ) \ { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ - ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (j )*cs_p; \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ + ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (j )*cs_p; \ \ PASTEMAC(chr,setm)( 0, \ BLIS_NONUNIT_DIAG, \ @@ -393,9 +393,9 @@ void PASTEMAC(ch,varname)( \ m_edge, \ n_edge, \ zero_r, \ - p_edge_ri, rs_p, cs_p ); \ + p_edge_rpi, rs_p, cs_p ); \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_3m ) diff --git a/frame/1m/packm/bli_packm_gen_cxk.h b/frame/1m/packm/bli_packm_gen_cxk.h index 6065675d5..7ffef14fa 100644 --- a/frame/1m/packm/bli_packm_gen_cxk.h +++ b/frame/1m/packm/bli_packm_gen_cxk.h @@ -70,6 +70,6 @@ void PASTEMAC(ch,varname)( \ ctype* restrict p, inc_t rs_p, inc_t cs_p \ ); -INSERT_GENTPROTCO_BASIC( packm_gen_cxk_ri ) +INSERT_GENTPROTCO_BASIC( packm_gen_cxk_4m ) -INSERT_GENTPROTCO_BASIC( packm_gen_cxk_ri3 ) +INSERT_GENTPROTCO_BASIC( packm_gen_cxk_3m ) diff --git a/frame/1m/packm/bli_packm_herm_cxk.c b/frame/1m/packm/bli_packm_herm_cxk.c index e1bec2814..2d0893209 100644 --- a/frame/1m/packm/bli_packm_herm_cxk.c +++ b/frame/1m/packm/bli_packm_herm_cxk.c @@ -396,7 +396,7 @@ void PASTEMAC(ch,varname)( \ } \ \ /* Pack the full panel. */ \ - PASTEMAC(ch,packm_cxk_ri)( conjc, \ + PASTEMAC(ch,packm_cxk_4m)( conjc, \ panel_dim, \ panel_len, \ kappa, \ @@ -470,7 +470,7 @@ void PASTEMAC(ch,varname)( \ \ /* Pack to p10. For upper storage, this includes the unstored triangle of c11. */ \ - PASTEMAC(ch,packm_cxk_ri)( conjc10, \ + PASTEMAC(ch,packm_cxk_4m)( conjc10, \ p10_dim, \ p10_len, \ kappa, \ @@ -479,7 +479,7 @@ void PASTEMAC(ch,varname)( \ \ /* Pack to p12. For lower storage, this includes the unstored triangle of c11. */ \ - PASTEMAC(ch,packm_cxk_ri)( conjc12, \ + PASTEMAC(ch,packm_cxk_4m)( conjc12, \ p12_dim, \ p12_len, \ kappa, \ @@ -627,7 +627,7 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_herm_cxk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_herm_cxk_4m ) @@ -732,12 +732,12 @@ void PASTEMAC(ch,varname)( \ } \ \ /* Pack the full panel. */ \ - PASTEMAC(ch,packm_cxk_ri3)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, psp, ldp ); \ + PASTEMAC(ch,packm_cxk_3m)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, psp, ldp ); \ } \ else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ { \ @@ -806,21 +806,21 @@ void PASTEMAC(ch,varname)( \ \ /* Pack to p10. For upper storage, this includes the unstored triangle of c11. */ \ - PASTEMAC(ch,packm_cxk_ri3)( conjc10, \ - p10_dim, \ - p10_len, \ - kappa, \ - c10, incc10, ldc10, \ - p10, psp, ldp ); \ + PASTEMAC(ch,packm_cxk_3m)( conjc10, \ + p10_dim, \ + p10_len, \ + kappa, \ + c10, incc10, ldc10, \ + p10, psp, ldp ); \ \ /* Pack to p12. For lower storage, this includes the unstored triangle of c11. */ \ - PASTEMAC(ch,packm_cxk_ri3)( conjc12, \ - p12_dim, \ - p12_len, \ - kappa, \ - c12, incc12, ldc12, \ - p12, psp, ldp ); \ + PASTEMAC(ch,packm_cxk_3m)( conjc12, \ + p12_dim, \ + p12_len, \ + kappa, \ + c12, incc12, ldc12, \ + p12, psp, ldp ); \ \ /* Pack the stored triangle of c11 to p11. */ \ { \ @@ -902,18 +902,18 @@ void PASTEMAC(ch,varname)( \ /* Update the p11 section of the ri panel. It simply needs to contain the sum of p11_r + p11_i. */ \ { \ - ctype_r* p11_ri = p11_i + psp; \ + ctype_r* p11_rpi = p11_i + psp; \ \ for ( j = 0; j < p11_n; ++j ) \ for ( i = 0; i < p11_m; ++i ) \ { \ - ctype_r* pi11_r = p11_r + (i )*rs_p11 + (j )*cs_p11; \ - ctype_r* pi11_i = p11_i + (i )*rs_p11 + (j )*cs_p11; \ - ctype_r* pi11_ri = p11_ri + (i )*rs_p11 + (j )*cs_p11; \ + ctype_r* pi11_r = p11_r + (i )*rs_p11 + (j )*cs_p11; \ + ctype_r* pi11_i = p11_i + (i )*rs_p11 + (j )*cs_p11; \ + ctype_r* pi11_rpi = p11_rpi + (i )*rs_p11 + (j )*cs_p11; \ \ PASTEMAC(chr,add3s)( *pi11_r, \ *pi11_i, \ - *pi11_ri ); \ + *pi11_rpi ); \ } \ } \ /* @@ -934,12 +934,12 @@ void PASTEMAC(ch,varname)( \ different register blockings for the edge cases. */ \ if ( m_panel != m_panel_max ) \ { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ - ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (i )*rs_p; \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ + ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (i )*rs_p; \ \ PASTEMAC(chr,setm)( 0, \ BLIS_NONUNIT_DIAG, \ @@ -961,17 +961,17 @@ void PASTEMAC(ch,varname)( \ m_edge, \ n_edge, \ zero_r, \ - p_edge_ri, rs_p, cs_p ); \ + p_edge_rpi, rs_p, cs_p ); \ } \ \ if ( n_panel != n_panel_max ) \ { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ - ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (j )*cs_p; \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ + ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (j )*cs_p; \ \ PASTEMAC(chr,setm)( 0, \ BLIS_NONUNIT_DIAG, \ @@ -993,9 +993,9 @@ void PASTEMAC(ch,varname)( \ m_edge, \ n_edge, \ zero_r, \ - p_edge_ri, rs_p, cs_p ); \ + p_edge_rpi, rs_p, cs_p ); \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_herm_cxk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_herm_cxk_3m ) diff --git a/frame/1m/packm/bli_packm_herm_cxk.h b/frame/1m/packm/bli_packm_herm_cxk.h index e70cbabb2..b574f689e 100644 --- a/frame/1m/packm/bli_packm_herm_cxk.h +++ b/frame/1m/packm/bli_packm_herm_cxk.h @@ -70,6 +70,6 @@ void PASTEMAC(ch,varname)( \ ctype* restrict p, inc_t rs_p, inc_t cs_p \ ); -INSERT_GENTPROTCO_BASIC( packm_herm_cxk_ri ) +INSERT_GENTPROTCO_BASIC( packm_herm_cxk_4m ) -INSERT_GENTPROTCO_BASIC( packm_herm_cxk_ri3 ) +INSERT_GENTPROTCO_BASIC( packm_herm_cxk_3m ) diff --git a/frame/1m/packm/bli_packm_tri_cxk.c b/frame/1m/packm/bli_packm_tri_cxk.c index 12d577436..1c2236f60 100644 --- a/frame/1m/packm/bli_packm_tri_cxk.c +++ b/frame/1m/packm/bli_packm_tri_cxk.c @@ -284,7 +284,7 @@ void PASTEMAC(ch,varname)( \ \ \ /* Pack the panel. */ \ - PASTEMAC(ch,packm_cxk_ri)( conjc, \ + PASTEMAC(ch,packm_cxk_4m)( conjc, \ panel_dim, \ panel_len, \ kappa, \ @@ -449,7 +449,7 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_4m ) @@ -520,20 +520,20 @@ void PASTEMAC(ch,varname)( \ \ \ /* Pack the panel. */ \ - PASTEMAC(ch,packm_cxk_ri3)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, psp, ldp ); \ + PASTEMAC(ch,packm_cxk_3m)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, psp, ldp ); \ \ \ /* Tweak the panel according to its triangular structure */ \ { \ - dim_t j = bli_abs( diagoffp ); \ - ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \ - ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \ - ctype_r* p11_ri = ( ctype_r* )p + 2*psp + (j )*ldp; \ + dim_t j = bli_abs( diagoffp ); \ + ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \ + ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \ + ctype_r* p11_rpi = ( ctype_r* )p + 2*psp + (j )*ldp; \ \ /* If the diagonal of c is implicitly unit, explicitly set the the diagonal of the packed panel to kappa. */ \ @@ -556,7 +556,7 @@ void PASTEMAC(ch,varname)( \ m_panel, \ n_panel, \ &kappa_r, \ - p11_ri, rs_p11, cs_p11 ); \ + p11_rpi, rs_p11, cs_p11 ); \ } \ \ /* If requested, invert the diagonal of the packed panel. Note @@ -609,7 +609,7 @@ void PASTEMAC(ch,varname)( \ panel_dim, \ panel_dim, \ zero_r, \ - p11_ri, rs_p11, cs_p11 ); \ + p11_rpi, rs_p11, cs_p11 ); \ } \ } \ \ @@ -623,12 +623,12 @@ void PASTEMAC(ch,varname)( \ different register blockings for the edge cases. */ \ if ( m_panel != m_panel_max ) \ { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ - ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (i )*rs_p; \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ + ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (i )*rs_p; \ \ PASTEMAC(chr,setm)( 0, \ BLIS_NONUNIT_DIAG, \ @@ -650,17 +650,17 @@ void PASTEMAC(ch,varname)( \ m_edge, \ n_edge, \ zero_r, \ - p_edge_ri, rs_p, cs_p ); \ + p_edge_rpi, rs_p, cs_p ); \ } \ \ if ( n_panel != n_panel_max ) \ { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ - ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (j )*cs_p; \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ + ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (j )*cs_p; \ \ PASTEMAC(chr,setm)( 0, \ BLIS_NONUNIT_DIAG, \ @@ -682,7 +682,7 @@ void PASTEMAC(ch,varname)( \ m_edge, \ n_edge, \ zero_r, \ - p_edge_ri, rs_p, cs_p ); \ + p_edge_rpi, rs_p, cs_p ); \ } \ \ /* If this panel is an edge case in both panel dimension and length, @@ -716,5 +716,5 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_3m ) diff --git a/frame/1m/packm/bli_packm_tri_cxk.h b/frame/1m/packm/bli_packm_tri_cxk.h index 8db7f242b..63756cd34 100644 --- a/frame/1m/packm/bli_packm_tri_cxk.h +++ b/frame/1m/packm/bli_packm_tri_cxk.h @@ -73,7 +73,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict p, inc_t rs_p, inc_t cs_p \ ); -INSERT_GENTPROTCO_BASIC( packm_tri_cxk_ri ) +INSERT_GENTPROTCO_BASIC( packm_tri_cxk_4m ) -INSERT_GENTPROTCO_BASIC( packm_tri_cxk_ri3 ) +INSERT_GENTPROTCO_BASIC( packm_tri_cxk_3m ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.c b/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.c similarity index 71% rename from frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.c rename to frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.c index 11253d910..a3595e4a7 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.c @@ -55,7 +55,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ \ if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ { \ @@ -63,28 +63,28 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ @@ -94,34 +94,34 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_3m ) @@ -146,7 +146,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ \ if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ { \ @@ -154,32 +154,32 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ @@ -189,38 +189,38 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_3m ) @@ -245,7 +245,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ \ if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ { \ @@ -253,36 +253,36 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ @@ -292,42 +292,42 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_3m ) @@ -352,7 +352,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ \ if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ { \ @@ -360,40 +360,40 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ @@ -403,46 +403,46 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_3m ) @@ -467,7 +467,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ \ if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ { \ @@ -475,44 +475,44 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ @@ -522,50 +522,50 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_3m ) @@ -590,7 +590,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ \ if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ { \ @@ -598,48 +598,48 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ @@ -649,54 +649,54 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_3m ) @@ -721,7 +721,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ \ if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ { \ @@ -729,52 +729,52 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_rpi +12) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_rpi +13) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_rpi +12) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_rpi +13) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ @@ -784,58 +784,58 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_rpi +12) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_rpi +13) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_rpi +12) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_rpi +13) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_3m ) @@ -860,7 +860,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ \ if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ { \ @@ -868,56 +868,56 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_rpi +12) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_rpi +13) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_rpi +14) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_rpi +15) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_rpi +12) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_rpi +13) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_rpi +14) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_rpi +15) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ @@ -927,60 +927,60 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_rpi +12) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_rpi +13) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_rpi +14) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_rpi +15) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_rpi + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_rpi + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_rpi + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_rpi + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_rpi + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_rpi + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_rpi + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_rpi + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_rpi + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_rpi + 9) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_rpi +10) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_rpi +11) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_rpi +12) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_rpi +13) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_rpi +14) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_rpi +15) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp; \ pi1_i += ldp; \ - pi1_ri += ldp; \ + pi1_rpi += ldp; \ } \ } \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_ri3 ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_3m ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.h b/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.h similarity index 85% rename from frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.h rename to frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.h index 920f0023f..c483c8614 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.h @@ -43,12 +43,12 @@ void PASTEMAC(ch,varname)( \ void* p, inc_t psp, inc_t ldp \ ); -INSERT_GENTPROT_BASIC( packm_ref_2xk_ri ) -INSERT_GENTPROT_BASIC( packm_ref_4xk_ri ) -INSERT_GENTPROT_BASIC( packm_ref_6xk_ri ) -INSERT_GENTPROT_BASIC( packm_ref_8xk_ri ) -INSERT_GENTPROT_BASIC( packm_ref_10xk_ri ) -INSERT_GENTPROT_BASIC( packm_ref_12xk_ri ) -INSERT_GENTPROT_BASIC( packm_ref_14xk_ri ) -INSERT_GENTPROT_BASIC( packm_ref_16xk_ri ) +INSERT_GENTPROT_BASIC( packm_ref_2xk_3m ) +INSERT_GENTPROT_BASIC( packm_ref_4xk_3m ) +INSERT_GENTPROT_BASIC( packm_ref_6xk_3m ) +INSERT_GENTPROT_BASIC( packm_ref_8xk_3m ) +INSERT_GENTPROT_BASIC( packm_ref_10xk_3m ) +INSERT_GENTPROT_BASIC( packm_ref_12xk_3m ) +INSERT_GENTPROT_BASIC( packm_ref_14xk_3m ) +INSERT_GENTPROT_BASIC( packm_ref_16xk_3m ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.c b/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.c similarity index 99% rename from frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.c rename to frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.c index bd88c0941..e5a83ea84 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.c @@ -116,7 +116,7 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_4m ) @@ -210,7 +210,7 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_4m ) @@ -312,7 +312,7 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_4m ) @@ -422,7 +422,7 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_4m ) @@ -540,7 +540,7 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_4m ) @@ -666,7 +666,7 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_4m ) @@ -800,7 +800,7 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_4m ) @@ -942,5 +942,5 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_ri ) +INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_4m ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.h b/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.h similarity index 85% rename from frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.h rename to frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.h index 2800291c8..7e5f77bd7 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.h @@ -43,12 +43,12 @@ void PASTEMAC(ch,varname)( \ void* p, inc_t psp, inc_t ldp \ ); -INSERT_GENTPROT_BASIC( packm_ref_2xk_ri3 ) -INSERT_GENTPROT_BASIC( packm_ref_4xk_ri3 ) -INSERT_GENTPROT_BASIC( packm_ref_6xk_ri3 ) -INSERT_GENTPROT_BASIC( packm_ref_8xk_ri3 ) -INSERT_GENTPROT_BASIC( packm_ref_10xk_ri3 ) -INSERT_GENTPROT_BASIC( packm_ref_12xk_ri3 ) -INSERT_GENTPROT_BASIC( packm_ref_14xk_ri3 ) -INSERT_GENTPROT_BASIC( packm_ref_16xk_ri3 ) +INSERT_GENTPROT_BASIC( packm_ref_2xk_4m ) +INSERT_GENTPROT_BASIC( packm_ref_4xk_4m ) +INSERT_GENTPROT_BASIC( packm_ref_6xk_4m ) +INSERT_GENTPROT_BASIC( packm_ref_8xk_4m ) +INSERT_GENTPROT_BASIC( packm_ref_10xk_4m ) +INSERT_GENTPROT_BASIC( packm_ref_12xk_4m ) +INSERT_GENTPROT_BASIC( packm_ref_14xk_4m ) +INSERT_GENTPROT_BASIC( packm_ref_16xk_4m ) diff --git a/frame/include/bli_kernel_3m_macro_defs.h b/frame/include/bli_kernel_3m_macro_defs.h index b21896d52..afa120e43 100644 --- a/frame/include/bli_kernel_3m_macro_defs.h +++ b/frame/include/bli_kernel_3m_macro_defs.h @@ -106,84 +106,84 @@ // Level-1m // -// packm_2xk_ri3 kernels +// packm_2xk_3m kernels -#ifndef BLIS_CPACKM_2XK_RI3_KERNEL -#define BLIS_CPACKM_2XK_RI3_KERNEL BLIS_CPACKM_2XK_RI3_KERNEL_REF +#ifndef BLIS_CPACKM_2XK_3M_KERNEL +#define BLIS_CPACKM_2XK_3M_KERNEL BLIS_CPACKM_2XK_3M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_2XK_RI3_KERNEL -#define BLIS_ZPACKM_2XK_RI3_KERNEL BLIS_ZPACKM_2XK_RI3_KERNEL_REF +#ifndef BLIS_ZPACKM_2XK_3M_KERNEL +#define BLIS_ZPACKM_2XK_3M_KERNEL BLIS_ZPACKM_2XK_3M_KERNEL_REF #endif -// packm_4xk_ri3 kernels +// packm_4xk_3m kernels -#ifndef BLIS_CPACKM_4XK_RI3_KERNEL -#define BLIS_CPACKM_4XK_RI3_KERNEL BLIS_CPACKM_4XK_RI3_KERNEL_REF +#ifndef BLIS_CPACKM_4XK_3M_KERNEL +#define BLIS_CPACKM_4XK_3M_KERNEL BLIS_CPACKM_4XK_3M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_4XK_RI3_KERNEL -#define BLIS_ZPACKM_4XK_RI3_KERNEL BLIS_ZPACKM_4XK_RI3_KERNEL_REF +#ifndef BLIS_ZPACKM_4XK_3M_KERNEL +#define BLIS_ZPACKM_4XK_3M_KERNEL BLIS_ZPACKM_4XK_3M_KERNEL_REF #endif -// packm_6xk_ri3 kernels +// packm_6xk_3m kernels -#ifndef BLIS_CPACKM_6XK_RI3_KERNEL -#define BLIS_CPACKM_6XK_RI3_KERNEL BLIS_CPACKM_6XK_RI3_KERNEL_REF +#ifndef BLIS_CPACKM_6XK_3M_KERNEL +#define BLIS_CPACKM_6XK_3M_KERNEL BLIS_CPACKM_6XK_3M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_6XK_RI3_KERNEL -#define BLIS_ZPACKM_6XK_RI3_KERNEL BLIS_ZPACKM_6XK_RI3_KERNEL_REF +#ifndef BLIS_ZPACKM_6XK_3M_KERNEL +#define BLIS_ZPACKM_6XK_3M_KERNEL BLIS_ZPACKM_6XK_3M_KERNEL_REF #endif -// packm_8xk_ri3 kernels +// packm_8xk_3m kernels -#ifndef BLIS_CPACKM_8XK_RI3_KERNEL -#define BLIS_CPACKM_8XK_RI3_KERNEL BLIS_CPACKM_8XK_RI3_KERNEL_REF +#ifndef BLIS_CPACKM_8XK_3M_KERNEL +#define BLIS_CPACKM_8XK_3M_KERNEL BLIS_CPACKM_8XK_3M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_8XK_RI3_KERNEL -#define BLIS_ZPACKM_8XK_RI3_KERNEL BLIS_ZPACKM_8XK_RI3_KERNEL_REF +#ifndef BLIS_ZPACKM_8XK_3M_KERNEL +#define BLIS_ZPACKM_8XK_3M_KERNEL BLIS_ZPACKM_8XK_3M_KERNEL_REF #endif -// packm_10xk_ri3 kernels +// packm_10xk_3m kernels -#ifndef BLIS_CPACKM_10XK_RI3_KERNEL -#define BLIS_CPACKM_10XK_RI3_KERNEL BLIS_CPACKM_10XK_RI3_KERNEL_REF +#ifndef BLIS_CPACKM_10XK_3M_KERNEL +#define BLIS_CPACKM_10XK_3M_KERNEL BLIS_CPACKM_10XK_3M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_10XK_RI3_KERNEL -#define BLIS_ZPACKM_10XK_RI3_KERNEL BLIS_ZPACKM_10XK_RI3_KERNEL_REF +#ifndef BLIS_ZPACKM_10XK_3M_KERNEL +#define BLIS_ZPACKM_10XK_3M_KERNEL BLIS_ZPACKM_10XK_3M_KERNEL_REF #endif -// packm_12xk_ri3 kernels +// packm_12xk_3m kernels -#ifndef BLIS_CPACKM_12XK_RI3_KERNEL -#define BLIS_CPACKM_12XK_RI3_KERNEL BLIS_CPACKM_12XK_RI3_KERNEL_REF +#ifndef BLIS_CPACKM_12XK_3M_KERNEL +#define BLIS_CPACKM_12XK_3M_KERNEL BLIS_CPACKM_12XK_3M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_12XK_RI3_KERNEL -#define BLIS_ZPACKM_12XK_RI3_KERNEL BLIS_ZPACKM_12XK_RI3_KERNEL_REF +#ifndef BLIS_ZPACKM_12XK_3M_KERNEL +#define BLIS_ZPACKM_12XK_3M_KERNEL BLIS_ZPACKM_12XK_3M_KERNEL_REF #endif -// packm_14xk_ri3 kernels +// packm_14xk_3m kernels -#ifndef BLIS_CPACKM_14XK_RI3_KERNEL -#define BLIS_CPACKM_14XK_RI3_KERNEL BLIS_CPACKM_14XK_RI3_KERNEL_REF +#ifndef BLIS_CPACKM_14XK_3M_KERNEL +#define BLIS_CPACKM_14XK_3M_KERNEL BLIS_CPACKM_14XK_3M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_14XK_RI3_KERNEL -#define BLIS_ZPACKM_14XK_RI3_KERNEL BLIS_ZPACKM_14XK_RI3_KERNEL_REF +#ifndef BLIS_ZPACKM_14XK_3M_KERNEL +#define BLIS_ZPACKM_14XK_3M_KERNEL BLIS_ZPACKM_14XK_3M_KERNEL_REF #endif -// packm_16xk_ri3 kernels +// packm_16xk_3m kernels -#ifndef BLIS_CPACKM_16XK_RI3_KERNEL -#define BLIS_CPACKM_16XK_RI3_KERNEL BLIS_CPACKM_16XK_RI3_KERNEL_REF +#ifndef BLIS_CPACKM_16XK_3M_KERNEL +#define BLIS_CPACKM_16XK_3M_KERNEL BLIS_CPACKM_16XK_3M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_16XK_RI3_KERNEL -#define BLIS_ZPACKM_16XK_RI3_KERNEL BLIS_ZPACKM_16XK_RI3_KERNEL_REF +#ifndef BLIS_ZPACKM_16XK_3M_KERNEL +#define BLIS_ZPACKM_16XK_3M_KERNEL BLIS_ZPACKM_16XK_3M_KERNEL_REF #endif diff --git a/frame/include/bli_kernel_4m_macro_defs.h b/frame/include/bli_kernel_4m_macro_defs.h index 0cdb8ba39..29585e664 100644 --- a/frame/include/bli_kernel_4m_macro_defs.h +++ b/frame/include/bli_kernel_4m_macro_defs.h @@ -106,84 +106,84 @@ // Level-1m // -// packm_2xk_ri kernels +// packm_2xk_4m kernels -#ifndef BLIS_CPACKM_2XK_RI_KERNEL -#define BLIS_CPACKM_2XK_RI_KERNEL BLIS_CPACKM_2XK_RI_KERNEL_REF +#ifndef BLIS_CPACKM_2XK_4M_KERNEL +#define BLIS_CPACKM_2XK_4M_KERNEL BLIS_CPACKM_2XK_4M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_2XK_RI_KERNEL -#define BLIS_ZPACKM_2XK_RI_KERNEL BLIS_ZPACKM_2XK_RI_KERNEL_REF +#ifndef BLIS_ZPACKM_2XK_4M_KERNEL +#define BLIS_ZPACKM_2XK_4M_KERNEL BLIS_ZPACKM_2XK_4M_KERNEL_REF #endif -// packm_4xk_ri kernels +// packm_4xk_4m kernels -#ifndef BLIS_CPACKM_4XK_RI_KERNEL -#define BLIS_CPACKM_4XK_RI_KERNEL BLIS_CPACKM_4XK_RI_KERNEL_REF +#ifndef BLIS_CPACKM_4XK_4M_KERNEL +#define BLIS_CPACKM_4XK_4M_KERNEL BLIS_CPACKM_4XK_4M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_4XK_RI_KERNEL -#define BLIS_ZPACKM_4XK_RI_KERNEL BLIS_ZPACKM_4XK_RI_KERNEL_REF +#ifndef BLIS_ZPACKM_4XK_4M_KERNEL +#define BLIS_ZPACKM_4XK_4M_KERNEL BLIS_ZPACKM_4XK_4M_KERNEL_REF #endif -// packm_6xk_ri kernels +// packm_6xk_4m kernels -#ifndef BLIS_CPACKM_6XK_RI_KERNEL -#define BLIS_CPACKM_6XK_RI_KERNEL BLIS_CPACKM_6XK_RI_KERNEL_REF +#ifndef BLIS_CPACKM_6XK_4M_KERNEL +#define BLIS_CPACKM_6XK_4M_KERNEL BLIS_CPACKM_6XK_4M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_6XK_RI_KERNEL -#define BLIS_ZPACKM_6XK_RI_KERNEL BLIS_ZPACKM_6XK_RI_KERNEL_REF +#ifndef BLIS_ZPACKM_6XK_4M_KERNEL +#define BLIS_ZPACKM_6XK_4M_KERNEL BLIS_ZPACKM_6XK_4M_KERNEL_REF #endif -// packm_8xk_ri kernels +// packm_8xk_4m kernels -#ifndef BLIS_CPACKM_8XK_RI_KERNEL -#define BLIS_CPACKM_8XK_RI_KERNEL BLIS_CPACKM_8XK_RI_KERNEL_REF +#ifndef BLIS_CPACKM_8XK_4M_KERNEL +#define BLIS_CPACKM_8XK_4M_KERNEL BLIS_CPACKM_8XK_4M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_8XK_RI_KERNEL -#define BLIS_ZPACKM_8XK_RI_KERNEL BLIS_ZPACKM_8XK_RI_KERNEL_REF +#ifndef BLIS_ZPACKM_8XK_4M_KERNEL +#define BLIS_ZPACKM_8XK_4M_KERNEL BLIS_ZPACKM_8XK_4M_KERNEL_REF #endif -// packm_10xk_ri kernels +// packm_10xk_4m kernels -#ifndef BLIS_CPACKM_10XK_RI_KERNEL -#define BLIS_CPACKM_10XK_RI_KERNEL BLIS_CPACKM_10XK_RI_KERNEL_REF +#ifndef BLIS_CPACKM_10XK_4M_KERNEL +#define BLIS_CPACKM_10XK_4M_KERNEL BLIS_CPACKM_10XK_4M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_10XK_RI_KERNEL -#define BLIS_ZPACKM_10XK_RI_KERNEL BLIS_ZPACKM_10XK_RI_KERNEL_REF +#ifndef BLIS_ZPACKM_10XK_4M_KERNEL +#define BLIS_ZPACKM_10XK_4M_KERNEL BLIS_ZPACKM_10XK_4M_KERNEL_REF #endif -// packm_12xk_ri kernels +// packm_12xk_4m kernels -#ifndef BLIS_CPACKM_12XK_RI_KERNEL -#define BLIS_CPACKM_12XK_RI_KERNEL BLIS_CPACKM_12XK_RI_KERNEL_REF +#ifndef BLIS_CPACKM_12XK_4M_KERNEL +#define BLIS_CPACKM_12XK_4M_KERNEL BLIS_CPACKM_12XK_4M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_12XK_RI_KERNEL -#define BLIS_ZPACKM_12XK_RI_KERNEL BLIS_ZPACKM_12XK_RI_KERNEL_REF +#ifndef BLIS_ZPACKM_12XK_4M_KERNEL +#define BLIS_ZPACKM_12XK_4M_KERNEL BLIS_ZPACKM_12XK_4M_KERNEL_REF #endif -// packm_14xk_ri kernels +// packm_14xk_4m kernels -#ifndef BLIS_CPACKM_14XK_RI_KERNEL -#define BLIS_CPACKM_14XK_RI_KERNEL BLIS_CPACKM_14XK_RI_KERNEL_REF +#ifndef BLIS_CPACKM_14XK_4M_KERNEL +#define BLIS_CPACKM_14XK_4M_KERNEL BLIS_CPACKM_14XK_4M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_14XK_RI_KERNEL -#define BLIS_ZPACKM_14XK_RI_KERNEL BLIS_ZPACKM_14XK_RI_KERNEL_REF +#ifndef BLIS_ZPACKM_14XK_4M_KERNEL +#define BLIS_ZPACKM_14XK_4M_KERNEL BLIS_ZPACKM_14XK_4M_KERNEL_REF #endif -// packm_16xk_ri kernels +// packm_16xk_4m kernels -#ifndef BLIS_CPACKM_16XK_RI_KERNEL -#define BLIS_CPACKM_16XK_RI_KERNEL BLIS_CPACKM_16XK_RI_KERNEL_REF +#ifndef BLIS_CPACKM_16XK_4M_KERNEL +#define BLIS_CPACKM_16XK_4M_KERNEL BLIS_CPACKM_16XK_4M_KERNEL_REF #endif -#ifndef BLIS_ZPACKM_16XK_RI_KERNEL -#define BLIS_ZPACKM_16XK_RI_KERNEL BLIS_ZPACKM_16XK_RI_KERNEL_REF +#ifndef BLIS_ZPACKM_16XK_4M_KERNEL +#define BLIS_ZPACKM_16XK_4M_KERNEL BLIS_ZPACKM_16XK_4M_KERNEL_REF #endif diff --git a/frame/include/bli_kernel_pre_macro_defs.h b/frame/include/bli_kernel_pre_macro_defs.h index 9c8bcadef..cfaae8d3c 100644 --- a/frame/include/bli_kernel_pre_macro_defs.h +++ b/frame/include/bli_kernel_pre_macro_defs.h @@ -194,85 +194,85 @@ #define BLIS_CPACKM_16XK_KERNEL_REF bli_cpackm_ref_16xk #define BLIS_ZPACKM_16XK_KERNEL_REF bli_zpackm_ref_16xk -// packm_2xk_ri kernels +// packm_2xk_4m kernels -#define BLIS_CPACKM_2XK_RI_KERNEL_REF bli_cpackm_ref_2xk_ri -#define BLIS_ZPACKM_2XK_RI_KERNEL_REF bli_zpackm_ref_2xk_ri +#define BLIS_CPACKM_2XK_4M_KERNEL_REF bli_cpackm_ref_2xk_4m +#define BLIS_ZPACKM_2XK_4M_KERNEL_REF bli_zpackm_ref_2xk_4m -// packm_4xk_ri kernels +// packm_4xk_4m kernels -#define BLIS_CPACKM_4XK_RI_KERNEL_REF bli_cpackm_ref_4xk_ri -#define BLIS_ZPACKM_4XK_RI_KERNEL_REF bli_zpackm_ref_4xk_ri +#define BLIS_CPACKM_4XK_4M_KERNEL_REF bli_cpackm_ref_4xk_4m +#define BLIS_ZPACKM_4XK_4M_KERNEL_REF bli_zpackm_ref_4xk_4m -// packm_6xk_ri kernels +// packm_6xk_4m kernels -#define BLIS_CPACKM_6XK_RI_KERNEL_REF bli_cpackm_ref_6xk_ri -#define BLIS_ZPACKM_6XK_RI_KERNEL_REF bli_zpackm_ref_6xk_ri +#define BLIS_CPACKM_6XK_4M_KERNEL_REF bli_cpackm_ref_6xk_4m +#define BLIS_ZPACKM_6XK_4M_KERNEL_REF bli_zpackm_ref_6xk_4m -// packm_8xk_ri kernels +// packm_8xk_4m kernels -#define BLIS_CPACKM_8XK_RI_KERNEL_REF bli_cpackm_ref_8xk_ri -#define BLIS_ZPACKM_8XK_RI_KERNEL_REF bli_zpackm_ref_8xk_ri +#define BLIS_CPACKM_8XK_4M_KERNEL_REF bli_cpackm_ref_8xk_4m +#define BLIS_ZPACKM_8XK_4M_KERNEL_REF bli_zpackm_ref_8xk_4m -// packm_10xk_ri kernels +// packm_10xk_4m kernels -#define BLIS_CPACKM_10XK_RI_KERNEL_REF bli_cpackm_ref_10xk_ri -#define BLIS_ZPACKM_10XK_RI_KERNEL_REF bli_zpackm_ref_10xk_ri +#define BLIS_CPACKM_10XK_4M_KERNEL_REF bli_cpackm_ref_10xk_4m +#define BLIS_ZPACKM_10XK_4M_KERNEL_REF bli_zpackm_ref_10xk_4m -// packm_12xk_ri kernels +// packm_12xk_4m kernels -#define BLIS_CPACKM_12XK_RI_KERNEL_REF bli_cpackm_ref_12xk_ri -#define BLIS_ZPACKM_12XK_RI_KERNEL_REF bli_zpackm_ref_12xk_ri +#define BLIS_CPACKM_12XK_4M_KERNEL_REF bli_cpackm_ref_12xk_4m +#define BLIS_ZPACKM_12XK_4M_KERNEL_REF bli_zpackm_ref_12xk_4m -// packm_14xk_ri kernels +// packm_14xk_4m kernels -#define BLIS_CPACKM_14XK_RI_KERNEL_REF bli_cpackm_ref_14xk_ri -#define BLIS_ZPACKM_14XK_RI_KERNEL_REF bli_zpackm_ref_14xk_ri +#define BLIS_CPACKM_14XK_4M_KERNEL_REF bli_cpackm_ref_14xk_4m +#define BLIS_ZPACKM_14XK_4M_KERNEL_REF bli_zpackm_ref_14xk_4m -// packm_16xk_ri kernels +// packm_16xk_4m kernels -#define BLIS_CPACKM_16XK_RI_KERNEL_REF bli_cpackm_ref_16xk_ri -#define BLIS_ZPACKM_16XK_RI_KERNEL_REF bli_zpackm_ref_16xk_ri +#define BLIS_CPACKM_16XK_4M_KERNEL_REF bli_cpackm_ref_16xk_4m +#define BLIS_ZPACKM_16XK_4M_KERNEL_REF bli_zpackm_ref_16xk_4m -// packm_2xk_ri3 kernels +// packm_2xk_3m kernels -#define BLIS_CPACKM_2XK_RI3_KERNEL_REF bli_cpackm_ref_2xk_ri3 -#define BLIS_ZPACKM_2XK_RI3_KERNEL_REF bli_zpackm_ref_2xk_ri3 +#define BLIS_CPACKM_2XK_3M_KERNEL_REF bli_cpackm_ref_2xk_3m +#define BLIS_ZPACKM_2XK_3M_KERNEL_REF bli_zpackm_ref_2xk_3m -// packm_4xk_ri3 kernels +// packm_4xk_3m kernels -#define BLIS_CPACKM_4XK_RI3_KERNEL_REF bli_cpackm_ref_4xk_ri3 -#define BLIS_ZPACKM_4XK_RI3_KERNEL_REF bli_zpackm_ref_4xk_ri3 +#define BLIS_CPACKM_4XK_3M_KERNEL_REF bli_cpackm_ref_4xk_3m +#define BLIS_ZPACKM_4XK_3M_KERNEL_REF bli_zpackm_ref_4xk_3m -// packm_6xk_ri3 kernels +// packm_6xk_3m kernels -#define BLIS_CPACKM_6XK_RI3_KERNEL_REF bli_cpackm_ref_6xk_ri3 -#define BLIS_ZPACKM_6XK_RI3_KERNEL_REF bli_zpackm_ref_6xk_ri3 +#define BLIS_CPACKM_6XK_3M_KERNEL_REF bli_cpackm_ref_6xk_3m +#define BLIS_ZPACKM_6XK_3M_KERNEL_REF bli_zpackm_ref_6xk_3m -// packm_8xk_ri3 kernels +// packm_8xk_3m kernels -#define BLIS_CPACKM_8XK_RI3_KERNEL_REF bli_cpackm_ref_8xk_ri3 -#define BLIS_ZPACKM_8XK_RI3_KERNEL_REF bli_zpackm_ref_8xk_ri3 +#define BLIS_CPACKM_8XK_3M_KERNEL_REF bli_cpackm_ref_8xk_3m +#define BLIS_ZPACKM_8XK_3M_KERNEL_REF bli_zpackm_ref_8xk_3m -// packm_10xk_ri3 kernels +// packm_10xk_3m kernels -#define BLIS_CPACKM_10XK_RI3_KERNEL_REF bli_cpackm_ref_10xk_ri3 -#define BLIS_ZPACKM_10XK_RI3_KERNEL_REF bli_zpackm_ref_10xk_ri3 +#define BLIS_CPACKM_10XK_3M_KERNEL_REF bli_cpackm_ref_10xk_3m +#define BLIS_ZPACKM_10XK_3M_KERNEL_REF bli_zpackm_ref_10xk_3m -// packm_12xk_ri3 kernels +// packm_12xk_3m kernels -#define BLIS_CPACKM_12XK_RI3_KERNEL_REF bli_cpackm_ref_12xk_ri3 -#define BLIS_ZPACKM_12XK_RI3_KERNEL_REF bli_zpackm_ref_12xk_ri3 +#define BLIS_CPACKM_12XK_3M_KERNEL_REF bli_cpackm_ref_12xk_3m +#define BLIS_ZPACKM_12XK_3M_KERNEL_REF bli_zpackm_ref_12xk_3m -// packm_14xk_ri3 kernels +// packm_14xk_3m kernels -#define BLIS_CPACKM_14XK_RI3_KERNEL_REF bli_cpackm_ref_14xk_ri3 -#define BLIS_ZPACKM_14XK_RI3_KERNEL_REF bli_zpackm_ref_14xk_ri3 +#define BLIS_CPACKM_14XK_3M_KERNEL_REF bli_cpackm_ref_14xk_3m +#define BLIS_ZPACKM_14XK_3M_KERNEL_REF bli_zpackm_ref_14xk_3m -// packm_16xk_ri3 kernels +// packm_16xk_3m kernels -#define BLIS_CPACKM_16XK_RI3_KERNEL_REF bli_cpackm_ref_16xk_ri3 -#define BLIS_ZPACKM_16XK_RI3_KERNEL_REF bli_zpackm_ref_16xk_ri3 +#define BLIS_CPACKM_16XK_3M_KERNEL_REF bli_cpackm_ref_16xk_3m +#define BLIS_ZPACKM_16XK_3M_KERNEL_REF bli_zpackm_ref_16xk_3m // unpack_2xk kernels