diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index dbdf6c9a0..4dbe7328f 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -46,4 +46,7 @@ #include "bli_packm_blk_var3.h" #include "bli_packm_cxk.h" +#include "bli_packm_gen_cxk.h" +#include "bli_packm_herm_cxk.h" +#include "bli_packm_tri_cxk.h" diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var2.c index 55c791f64..5f1ff0542 100644 --- a/frame/1m/packm/bli_packm_blk_var2.c +++ b/frame/1m/packm/bli_packm_blk_var2.c @@ -131,53 +131,29 @@ void PASTEMAC(ch,varname )( \ dim_t pd_p, inc_t ps_p \ ) \ { \ - ctype* restrict kappa_cast = kappa; \ ctype* restrict c_cast = c; \ ctype* restrict p_cast = p; \ - ctype* restrict zero = PASTEMAC(ch,0); \ +\ ctype* restrict c_begin; \ ctype* restrict p_begin; \ \ dim_t iter_dim; \ dim_t num_iter; \ dim_t it, ic, ip; \ - dim_t i, j; \ +\ dim_t ic0, ip0; \ dim_t ic_inc, ip_inc; \ dim_t panel_dim; \ - dim_t panel_len; \ doff_t diagoffc_i; \ doff_t diagoffc_inc; \ - doff_t diagoffc_i_abs; \ +\ dim_t panel_dim_i; \ inc_t vs_c; \ - inc_t incc, ldc; \ - inc_t ldp; \ dim_t* m_panel; \ dim_t* n_panel; \ dim_t m_panel_max; \ dim_t n_panel_max; \ conj_t conjc; \ -\ - ctype* restrict c10; \ - ctype* restrict p10; \ - dim_t p10_dim, p10_len; \ - inc_t incc10, ldc10; \ - doff_t diagoffc10; \ - conj_t conjc10; \ -\ - ctype* restrict c12; \ - ctype* restrict p12; \ - dim_t p12_dim, p12_len; \ - inc_t incc12, ldc12; \ - doff_t diagoffc12; \ - conj_t conjc12; \ -\ - ctype* restrict c11; \ - ctype* restrict p11; \ - dim_t p11_m; \ - dim_t p11_n; \ - inc_t rs_p11, cs_p11; \ \ \ /* Extract the conjugation bit from the transposition argument. */ \ @@ -200,37 +176,25 @@ void PASTEMAC(ch,varname )( \ { \ /* Prepare to pack to row-stored column panels. 
*/ \ iter_dim = n; \ - panel_len = m; \ panel_dim = pd_p; \ - incc = cs_c; \ - ldc = rs_c; \ vs_c = cs_c; \ diagoffc_inc = -( doff_t)panel_dim; \ - ldp = rs_p; \ m_panel = &m; \ n_panel = &panel_dim_i; \ m_panel_max = m_max; \ n_panel_max = panel_dim; \ - rs_p11 = rs_p; \ - cs_p11 = 1; \ } \ else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panels. */ \ iter_dim = m; \ - panel_len = n; \ panel_dim = pd_p; \ - incc = rs_c; \ - ldc = cs_c; \ vs_c = rs_c; \ diagoffc_inc = ( doff_t )panel_dim; \ - ldp = cs_p; \ m_panel = &panel_dim_i; \ n_panel = &n; \ m_panel_max = panel_dim; \ n_panel_max = n_max; \ - rs_p11 = 1; \ - cs_p11 = cs_p; \ } \ \ /* Compute the total number of iterations we'll need. */ \ @@ -256,216 +220,36 @@ void PASTEMAC(ch,varname )( \ \ /* If the current panel intersects the diagonal and C is either upper- or lower-stored, then we assume C is symmetric or - Hermitian and that it must be densified (note we don't even - bother passing in a densify parameter), in which case we pack - the panel in three stages. + Hermitian and that it must be densified. Otherwise, we pack the panel all at once. */ \ if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \ bli_is_upper_or_lower( uploc ) ) \ { \ - diagoffc_i_abs = bli_abs( diagoffc_i ); \ -\ - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. 
*/ \ - if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \ - ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ -\ - if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ - { \ - p10_dim = panel_dim_i; \ - p10_len = diagoffc_i_abs; \ - p10 = p_begin; \ - c10 = c_begin; \ - incc10 = incc; \ - ldc10 = ldc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim_i; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - diagoffc12 = diagoffc_i_abs - j; \ - p12 = p_begin + (j )*ldp; \ - c12 = c_begin + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ - incc12 = ldc; \ - ldc12 = incc; \ - conjc12 = conjc; \ -\ - p11_m = panel_dim_i; \ - p11_n = panel_dim_i; \ - j = diagoffc_i_abs; \ - p11 = p_begin + (j )*ldp; \ - c11 = c_begin + (j )*ldc; \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc12 ); \ - } \ - else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ - { \ - p10_dim = panel_dim_i; \ - p10_len = diagoffc_i_abs + panel_dim_i; \ - diagoffc10 = diagoffc_i; \ - p10 = p_begin; \ - c10 = c_begin; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ - incc10 = ldc; \ - ldc10 = incc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim_i; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - p12 = p_begin + (j )*ldp; \ - c12 = c_begin + (j )*ldc; \ - incc12 = incc; \ - ldc12 = ldc; \ - conjc12 = conjc; \ -\ - p11_m = panel_dim_i; \ - p11_n = panel_dim_i; \ - j = diagoffc_i_abs; \ - p11 = p_begin + (j )*ldp; \ - c11 = c_begin + (j )*ldc; \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc10 ); \ - } \ -\ - /* Pack to P10. For upper storage, this includes the unstored - triangle of C11. 
*/ \ - PASTEMAC(ch,packm_cxk)( conjc10, \ - p10_dim, \ - p10_len, \ - kappa_cast, \ - c10, incc10, ldc10, \ - p10, ldp ); \ -\ - /* Pack to P12. For lower storage, this includes the unstored - triangle of C11. */ \ - PASTEMAC(ch,packm_cxk)( conjc12, \ - p12_dim, \ - p12_len, \ - kappa_cast, \ - c12, incc12, ldc12, \ - p12, ldp ); \ -\ - /* Pack the stored triangule of C11 to P11. */ \ - PASTEMAC3(ch,ch,ch,scal2m_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - conjc, \ - p11_m, \ - p11_n, \ - kappa_cast, \ - c11, rs_c, cs_c, \ - p11, rs_p11, cs_p11 ); \ -\ - /* If source matrix C is Hermitian, we have to zero out the - imaginary components of the diagonal of P11 in case the - corresponding elements in C11 were not already zero. */ \ - if ( bli_is_hermitian( strucc ) ) \ - { \ - /* NOTE: We can directly increment p11 since we are done - using p11 for the remainder of the function. */ \ - for ( i = 0; i < p11_m; ++i ) \ - { \ - PASTEMAC(ch,seti0s)( *p11 ); \ -\ - p11 += rs_p11 + cs_p11; \ - } \ - } \ + PASTEMAC(ch,packm_herm_cxk)( strucc, \ + diagoffc_i, \ + uploc, \ + conjc, \ + *m_panel, \ + *n_panel, \ + m_panel_max, \ + n_panel_max, \ + kappa, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ } \ else \ { \ - /* Note that the following code executes if the current panel either: - - does not intersect the diagonal, or - - does intersect the diagonal, BUT the matrix is general - which means the entire current panel can be copied at once. */ \ -\ - /* We use some c10-specific variables here because we might need - to change them if the current panel is unstored. (The values - below are used if the current panel is stored.) */ \ - c10 = c_begin; \ - incc10 = incc; \ - ldc10 = ldc; \ - conjc10 = conjc; \ -\ - /* If the current panel is unstored, we need to make a few - adjustments so we refer to the data where it is actually - stored, and so we take conjugation into account. 
(Note - this implicitly assumes we are operating on a symmetric or - Hermitian matrix, since a general matrix would not contain - any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel, *n_panel ) ) \ - { \ - c10 = c10 + diagoffc_i * ( doff_t )cs_c + \ - -diagoffc_i * ( doff_t )rs_c; \ - bli_swap_incs( incc10, ldc10 ); \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc10 ); \ - } \ -\ - /* Pack the current panel. */ \ - PASTEMAC(ch,packm_cxk)( conjc10, \ - panel_dim_i, \ - panel_len, \ - kappa_cast, \ - c10, incc10, ldc10, \ - p_begin, ldp ); \ -\ -/* - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: c", panel_len, panel_dim_i, \ - c_begin, ldc, incc, "%5.2f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: p copied", panel_len, panel_dim_i, \ - p_begin, ldp, 1, "%5.2f", "" ); \ -*/ \ - } \ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - either region exists, we set them to zero. This simplifies the - register level micro-kernel in that it does not need to support - different register blockings for the edge cases. 
*/ \ - if ( *m_panel != m_panel_max ) \ - { \ - dim_t i = *m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype* p_edge = p_begin + (i )*rs_p; \ -\ - PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ - } \ -\ - if ( *n_panel != n_panel_max ) \ - { \ - dim_t j = *n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype* p_edge = p_begin + (j )*cs_p; \ -\ - PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ + PASTEMAC(ch,packm_gen_cxk)( strucc, \ + diagoffc_i, \ + uploc, \ + conjc, \ + *m_panel, \ + *n_panel, \ + m_panel_max, \ + n_panel_max, \ + kappa, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ } \ \ /* @@ -474,7 +258,7 @@ void PASTEMAC(ch,varname )( \ p_begin, 1, cs_p, "%4.1f", "" ); \ if ( cs_p == 1 ) \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: b copied", m_panel_max, n_panel_max, \ - p_begin, panel_dim, 1, "%8.5f", "" ); \ + p_begin, rs_p, 1, "%4.1f", "" ); \ */ \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index 69ec20cbf..4a3eb8225 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -146,7 +146,6 @@ void PASTEMAC(ch,varname )( \ ctype* restrict kappa_cast = kappa; \ ctype* restrict c_cast = c; \ ctype* restrict p_cast = p; \ - ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict c_begin; \ ctype* restrict p_begin; \ \ @@ -155,7 +154,7 @@ void PASTEMAC(ch,varname )( \ dim_t it, ic, ip; \ dim_t ic0, ip0; \ doff_t ic_inc, ip_inc; \ - dim_t panel_dim; \ + dim_t panel_dim_max; \ dim_t panel_len; \ dim_t panel_len_max; \ doff_t diagoffc_i; \ @@ -165,10 +164,12 @@ void PASTEMAC(ch,varname )( \ dim_t panel_len_max_i; \ dim_t panel_off_i; \ inc_t vs_c; \ - inc_t incc, ldc; \ + inc_t ldc; \ inc_t ldp, p_inc; \ - dim_t* m_panel; \ - 
dim_t* n_panel; \ + dim_t* m_panel_full; \ + dim_t* n_panel_full; \ + dim_t* m_panel_max; \ + dim_t* n_panel_max; \ conj_t conjc; \ \ ctype* restrict c_use; \ @@ -203,16 +204,17 @@ void PASTEMAC(ch,varname )( \ iter_dim = n; \ panel_len = m; \ panel_len_max = m_max; \ - panel_dim = pd_p; \ - incc = cs_c; \ + panel_dim_max = pd_p; \ ldc = rs_c; \ vs_c = cs_c; \ - diagoffc_inc = -( doff_t)panel_dim; \ + diagoffc_inc = -( doff_t)panel_dim_max; \ ldp = rs_p; \ - m_panel = &m; \ - n_panel = &panel_dim_i; \ + m_panel_full = &m; \ + n_panel_full = &panel_dim_i; \ m_panel_use = &panel_len_i; \ n_panel_use = &panel_dim_i; \ + m_panel_max = &panel_len_max_i; \ + n_panel_max = &panel_dim_max; \ } \ else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ { \ @@ -220,35 +222,36 @@ void PASTEMAC(ch,varname )( \ iter_dim = m; \ panel_len = n; \ panel_len_max = n_max; \ - panel_dim = pd_p; \ - incc = rs_c; \ + panel_dim_max = pd_p; \ ldc = cs_c; \ vs_c = rs_c; \ - diagoffc_inc = ( doff_t )panel_dim; \ + diagoffc_inc = ( doff_t )panel_dim_max; \ ldp = cs_p; \ - m_panel = &panel_dim_i; \ - n_panel = &n; \ + m_panel_full = &panel_dim_i; \ + n_panel_full = &n; \ m_panel_use = &panel_dim_i; \ n_panel_use = &panel_len_i; \ + m_panel_max = &panel_dim_max; \ + n_panel_max = &panel_len_max_i; \ } \ \ /* Compute the total number of iterations we'll need. */ \ - num_iter = iter_dim / panel_dim + ( iter_dim % panel_dim ? 1 : 0 ); \ + num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ \ /* Set the initial values and increments for indices related to C and P based on whether reverse iteration was requested. 
*/ \ if ( ( revifup && bli_is_upper( uploc ) ) || \ ( reviflo && bli_is_lower( uploc ) ) ) \ { \ - ic0 = (num_iter - 1) * panel_dim; \ - ic_inc = -panel_dim; \ + ic0 = (num_iter - 1) * panel_dim_max; \ + ic_inc = -panel_dim_max; \ ip0 = num_iter - 1; \ ip_inc = -1; \ } \ else \ { \ ic0 = 0; \ - ic_inc = panel_dim; \ + ic_inc = panel_dim_max; \ ip0 = 0; \ ip_inc = 1; \ } \ @@ -258,7 +261,7 @@ void PASTEMAC(ch,varname )( \ for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ - panel_dim_i = bli_min( panel_dim, iter_dim - ic ); \ + panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ @@ -269,11 +272,11 @@ void PASTEMAC(ch,varname )( \ triangular), pack only as much as we need (ie: skip over as much as possible on the unstored side of the diagonal). Otherwise, we assume the current panel is full-length. */ \ - if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel, *n_panel ) ) \ + if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ { \ continue; \ } \ - else if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \ + else if ( bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) && \ bli_is_triangular( strucc ) ) \ { \ /* Sanity check. 
Diagonals should not intersect the short end of @@ -284,199 +287,73 @@ void PASTEMAC(ch,varname )( \ ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) \ + if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ { \ panel_off_i = 0; \ - panel_len_i = bli_min( panel_len, -diagoffc_i + panel_dim_i ); \ - panel_len_max_i = bli_min( panel_len_max, -diagoffc_i + panel_dim ); \ + panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \ + panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \ diagoffp = diagoffc_i; \ } \ - else if ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) \ + else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ { \ - panel_off_i = bli_abs( bli_min( diagoffc_i, 0 ) ); \ + panel_off_i = bli_abs( diagoffc_i ); \ panel_len_i = panel_len - panel_off_i; \ panel_len_max_i = panel_len_max - panel_off_i; \ - diagoffp = diagoffc_i + panel_off_i; \ - } \ - else if ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) \ - { \ - panel_off_i = bli_max( diagoffc_i, 0 ); \ - panel_len_i = panel_len - panel_off_i; \ - panel_len_max_i = panel_len_max - panel_off_i; \ - diagoffp = diagoffc_i - panel_off_i; \ - } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) */ \ - { \ - panel_off_i = 0; \ - panel_len_i = bli_min( panel_len, diagoffc_i + panel_dim_i ); \ - panel_len_max_i = bli_min( panel_len_max, diagoffc_i + panel_dim ); \ - diagoffp = diagoffc_i; \ + diagoffp = 0; \ } \ \ - /* Adjust the pointer to the beginning of the panel in C based on - the offset determined above. */ \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ - /* Pack the panel. 
*/ \ - PASTEMAC(ch,packm_cxk)( conjc, \ - panel_dim_i, \ - panel_len_i, \ - kappa_cast, \ - c_use, incc, ldc, \ - p_use, ldp ); \ + PASTEMAC(ch,packm_tri_cxk)( strucc, \ + diagoffp, \ + diagc, \ + uploc, \ + conjc, \ + invdiag, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_use, rs_c, cs_c, \ + p_use, rs_p, cs_p ); \ \ - /* If the diagonal of C is implicitly unit, set the diagonal of - the packed panel to unit. */ \ - if ( bli_is_unit_diag( diagc ) ) \ - { \ - PASTEMAC2(ch,ch,setd_unb_var1)( diagoffp, \ - *m_panel_use, \ - *n_panel_use, \ - kappa_cast, \ - p_use, rs_p, cs_p ); \ - } \ -\ - /* If requested, invert the diagonal of the packed panel. */ \ - if ( invdiag == TRUE ) \ - { \ - PASTEMAC(ch,invertd_unb_var1)( diagoffp, \ - *m_panel_use, \ - *n_panel_use, \ - p_use, rs_p, cs_p ); \ - } \ -\ - /* Always densify the unstored part of the packed panel. */ \ - { \ - uplo_t uplop = uploc; \ -\ - /* For triangular matrices, we wish to reference the region - strictly opposite the diagonal of C. This amounts to - toggling uploc and then shifting the diagonal offset to - shrink the stored region (by one diagonal). */ \ - bli_toggle_uplo( uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \ -\ - /* Set the region opposite the diagonal of P to zero. */ \ - PASTEMAC2(ch,ch,setm_unb_var1)( diagoffp, \ - BLIS_NONUNIT_DIAG, \ - uplop, \ - *m_panel_use, \ - *n_panel_use, \ - zero, \ - p_use, rs_p, cs_p ); \ - } \ \ p_inc = ldp * panel_len_max_i; \ } \ else \ { \ - c_use = c_begin; \ - p_use = p_begin; \ -\ - /* Pack a full-length panel. */ \ - panel_off_i = 0; \ panel_len_i = panel_len; \ panel_len_max_i = panel_len_max; \ \ - /* Pack the panel. 
*/ \ - PASTEMAC(ch,packm_cxk)( conjc, \ - panel_dim_i, \ - panel_len_i, \ - kappa_cast, \ - c_use, incc, ldc, \ - p_use, ldp ); \ + PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \ + 0, \ + BLIS_DENSE, \ + conjc, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ \ p_inc = ldp * panel_len_max_i; \ } \ \ - /* If necessary, zero-pad at the edge of the panel dimension (ie: along - the long dimension of the panel). */ \ - if ( panel_dim_i != panel_dim ) \ - { \ - /* Note that this code does the right thing for both row and - column panels, since an m x n column-stored row panel and an - n x m row-stored column panel look the same in memory. */ \ - dim_t i = panel_dim_i; \ - dim_t m_edge = panel_dim - i; \ - dim_t n_edge = panel_len_max_i; \ - inc_t rs_pe = 1; \ - inc_t cs_pe = ldp; \ - ctype* p_edge = p_begin + (i )*rs_pe; \ -\ - PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_pe, cs_pe ); \ - } \ -\ - /* If necessary, zero-pad at the far end of the panel (ie: at the - other side of the long dimension of the panel). */ \ - if ( panel_len_i != panel_len_max_i ) \ - { \ - /* Note that this code does the right thing for both row and - column panels, since an m x n column-stored row panel and an - n x m row-stored column panel look the same in memory. */ \ - /* Note that we set m_edge as panel_dim, and not panel_dim_i; - this is so that we can simultaneously zero out the corner - region (if it exists). 
*/ \ - dim_t j = panel_len_i; \ - dim_t m_edge = panel_dim; \ - dim_t n_edge = panel_len_max_i - j; \ - inc_t rs_pe = 1; \ - inc_t cs_pe = ldp; \ - ctype* p_edge = p_begin + (j )*cs_pe; \ -\ - PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_pe, cs_pe ); \ - } \ -\ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. */ \ - if ( panel_dim_i != panel_dim && \ - panel_len_i != panel_len_max_i ) \ - { \ - /* Note that this code does the right thing for both row and - column panels, since an m x n column-stored row panel and an - n x m row-stored column panel look the same in memory. */ \ - dim_t i = panel_dim_i; \ - dim_t j = panel_len_i; \ - dim_t m_br = panel_dim - i; \ - dim_t n_br = panel_len_max_i - j; \ - inc_t rs_pe = 1; \ - inc_t cs_pe = ldp; \ - ctype* one = PASTEMAC(ch,1); \ - ctype* p_edge = p_begin + (i )*rs_pe + (j )*cs_pe; \ -\ - PASTEMAC2(ch,ch,setd_unb_var1)( 0, \ - m_br, \ - n_br, \ - one, \ - p_edge, rs_pe, cs_pe ); \ -\ -/* - PASTEMAC(ch,fprintm)( stdout, "packm_var3: setting br unit diag", m_br, n_br, \ - p_edge, rs_pe, cs_pe, "%5.2f", "" ); \ -*/ \ - } \ \ /* if ( rs_p == 1 ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_var3: ap copied", panel_dim, panel_len_max_i, \ + PASTEMAC(ch,fprintm)( stdout, "packm_var3: ap copied", panel_dim_max, panel_len_max_i, \ p_begin, rs_p, cs_p, "%4.1f", "" ); \ if ( cs_p == 1 ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_var3: bp copied", panel_len_max_i, panel_dim, \ + PASTEMAC(ch,fprintm)( stdout, "packm_var3: bp copied", panel_len_max_i, panel_dim_max, \ p_begin, rs_p, cs_p, "%4.1f", "" ); \ */ \ +\ \ p_begin += p_inc; \ } \ diff --git a/frame/1m/packm/bli_packm_cxk.h b/frame/1m/packm/bli_packm_cxk.h index 800e1e137..60a88657e 100644 --- a/frame/1m/packm/bli_packm_cxk.h +++ 
b/frame/1m/packm/bli_packm_cxk.h @@ -52,7 +52,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* beta, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ + void* p, inc_t ldp \ ); INSERT_GENTPROT_BASIC( packm_cxk ) diff --git a/frame/1m/packm/bli_packm_gen_cxk.c b/frame/1m/packm/bli_packm_gen_cxk.c new file mode 100644 index 000000000..ec5e9d1d1 --- /dev/null +++ b/frame/1m/packm/bli_packm_gen_cxk.c @@ -0,0 +1,154 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +/* packm_gen_cxk: pack one m_panel x n_panel panel of c into p with a single packm_cxk call (kappa is forwarded to it), redirecting c_begin and swapping incc/ldc first when bli_is_unstored_subpart_n reports the panel as unstored; p is then zero-filled out to m_panel_max x n_panel_max. */ \ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + ctype* restrict c_begin = c; \ + ctype* restrict p_begin = p; \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict zero = PASTEMAC(ch,0); \ +\ + dim_t panel_dim; \ + dim_t panel_len; \ + inc_t incc, ldc; \ + inc_t ldp; \ +\ +\ + /* If the strides of p indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* If the current panel is unstored, we need to make a few + adjustments so we refer to the data where it is actually + stored, also taking conjugation into account. 
(Note this + implicitly assumes we are operating on a dense panel + within a larger symmetric or Hermitian matrix, since a + general matrix would not contain any unstored region.) */ \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + { \ + c_begin = c_begin + diagoffc * ( doff_t )cs_c + \ + -diagoffc * ( doff_t )rs_c; \ + bli_swap_incs( incc, ldc ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc ); \ + } \ +\ + /* Pack the panel. */ \ + PASTEMAC(ch,packm_cxk)( conjc, \ + panel_dim, \ + panel_len, \ + kappa_cast, \ + c_begin, incc, ldc, \ + p_begin, ldp ); \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimensions are inflated, empty regions are + created along the bottom and/or right edges of the matrix. If + either region exists, we set it to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. 
*/ \ + if ( m_panel != m_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype* p_edge = p_begin + (i )*rs_p; \ +\ + PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype* p_edge = p_begin + (j )*cs_p; \ +\ + PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ + } \ +\ +} + +INSERT_GENTFUNC_BASIC0( packm_gen_cxk ) + diff --git a/frame/1m/packm/bli_packm_gen_cxk.h b/frame/1m/packm/bli_packm_gen_cxk.h new file mode 100644 index 000000000..45d765a4e --- /dev/null +++ b/frame/1m/packm/bli_packm_gen_cxk.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +/* Declaration template for packm_gen_cxk; the parameter list mirrors the GENTFUNC definition in bli_packm_gen_cxk.c. */ \ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p \ + ); + +INSERT_GENTPROT_BASIC( packm_gen_cxk ) + diff --git a/frame/1m/packm/bli_packm_herm_cxk.c b/frame/1m/packm/bli_packm_herm_cxk.c new file mode 100644 index 000000000..876268868 --- /dev/null +++ b/frame/1m/packm/bli_packm_herm_cxk.c @@ -0,0 +1,274 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +/* packm_herm_cxk: pack a panel of c into p in three pieces -- P10 and P12 with packm_cxk and the panel_dim x panel_dim block P11 with scal2m_unb_var1 -- toggling conjugation, when strucc is Hermitian, on the piece whose c pointer is redirected and whose increments are swapped; p is then zero-filled out to m_panel_max x n_panel_max. */ \ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + ctype* restrict c_begin = c; \ + ctype* restrict p_begin = p; \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict zero = PASTEMAC(ch,0); \ +\ + dim_t i, j; \ + dim_t panel_len; \ + doff_t diagoffc_abs; \ + dim_t panel_dim; \ + inc_t incc, ldc; \ + inc_t ldp; \ +\ + ctype* restrict c10; \ + ctype* restrict p10; \ + dim_t p10_dim, p10_len; \ + inc_t incc10, ldc10; \ + doff_t diagoffc10; \ + conj_t conjc10; \ +\ + ctype* restrict c12; \ + ctype* restrict p12; \ + dim_t p12_dim, p12_len; \ + inc_t incc12, ldc12; \ + doff_t diagoffc12; \ + conj_t conjc12; \ +\ + ctype* restrict c11; \ + ctype* restrict p11; \ + dim_t p11_m; \ + dim_t p11_n; \ + inc_t rs_p11, cs_p11; \ +\ +\ + /* If the strides of p indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are 
packing to row panels. */ \ + if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_len = m_panel; \ + panel_dim = n_panel; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + rs_p11 = rs_p; \ + cs_p11 = 1; \ + } \ + else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_len = n_panel; \ + panel_dim = m_panel; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + rs_p11 = 1; \ + cs_p11 = cs_p; \ + } \ +\ + diagoffc_abs = bli_abs( diagoffc ); \ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then the constraint that cache + blocksizes be a whole multiple of the register blocksizes + was somehow violated. */ \ + if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \ + ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs; \ + p10 = p_begin; \ + c10 = c_begin; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_abs - j; \ + p12 = p_begin + (j )*ldp; \ + c12 = c_begin + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + p11_m = panel_dim; \ + p11_n = panel_dim; \ + j = diagoffc_abs; \ + p11 = p_begin + (j )*ldp; \ + c11 = c_begin + (j )*ldc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ + } \ + else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ + { \ + p10_dim = panel_dim; \ + p10_len 
= diagoffc_abs + panel_dim; \ + diagoffc10 = diagoffc; \ + p10 = p_begin; \ + c10 = c_begin; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p_begin + (j )*ldp; \ + c12 = c_begin + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + p11_m = panel_dim; \ + p11_n = panel_dim; \ + j = diagoffc_abs; \ + p11 = p_begin + (j )*ldp; \ + c11 = c_begin + (j )*ldc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ + } \ +\ + /* Pack to P10. For upper storage, this includes the unstored + triangle of C11. */ \ + PASTEMAC(ch,packm_cxk)( conjc10, \ + p10_dim, \ + p10_len, \ + kappa_cast, \ + c10, incc10, ldc10, \ + p10, ldp ); \ +\ + /* Pack to P12. For lower storage, this includes the unstored + triangle of C11. */ \ + PASTEMAC(ch,packm_cxk)( conjc12, \ + p12_dim, \ + p12_len, \ + kappa_cast, \ + c12, incc12, ldc12, \ + p12, ldp ); \ +\ + /* Pack the stored triangle of C11 to P11. */ \ + PASTEMAC3(ch,ch,ch,scal2m_unb_var1)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + conjc, \ + p11_m, \ + p11_n, \ + kappa_cast, \ + c11, rs_c, cs_c, \ + p11, rs_p11, cs_p11 ); \ +\ + /* If source matrix C is Hermitian, we have to zero out the + imaginary components of the diagonal of P11 in case the + corresponding elements in C11 were not already zero. */ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + /* NOTE: We can directly increment p11 since we are done + using p11 for the remainder of the function. */ \ + for ( i = 0; i < p11_m; ++i ) \ + { \ + PASTEMAC(ch,seti0s)( *p11 ); \ +\ + p11 += rs_p11 + cs_p11; \ + } \ + } \ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). 
When these dimensions are inflated, empty regions are + created along the bottom and/or right edges of the matrix. If + either region exists, we set it to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. */ \ + if ( m_panel != m_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype* p_edge = p_begin + (i )*rs_p; \ +\ + PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype* p_edge = p_begin + (j )*cs_p; \ +\ + PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ + } \ +\ +} + +INSERT_GENTFUNC_BASIC0( packm_herm_cxk ) + diff --git a/frame/1m/packm/bli_packm_herm_cxk.h b/frame/1m/packm/bli_packm_herm_cxk.h new file mode 100644 index 000000000..209f49d27 --- /dev/null +++ b/frame/1m/packm/bli_packm_herm_cxk.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +/* Prototype template for packm_herm_cxk; INSERT_GENTPROT_BASIC below + presumably expands it once per datatype character ch (confirm against + the macro definition). NOTE(review): kappa, c and p are declared void* + here, whereas the packm_tri_cxk prototype in bli_packm_tri_cxk.h + declares them as ctype*; confirm this matches the packm_herm_cxk + definition. */ +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p \ + ); + +INSERT_GENTPROT_BASIC( packm_herm_cxk ) + diff --git a/frame/1m/packm/bli_packm_tri_cxk.c b/frame/1m/packm/bli_packm_tri_cxk.c new file mode 100644 index 000000000..f72ab99e5 --- /dev/null +++ b/frame/1m/packm/bli_packm_tri_cxk.c @@ -0,0 +1,210 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +/* Type-generic definition of packm_tri_cxk; INSERT_GENTFUNC_BASIC0 below + presumably expands it once per datatype character ch (confirm against + the macro definition). The body packs one panel of C into P via + packm_cxk and then fixes up P in this order: overwrite the diagonal + with kappa when diagc is unit, invert the diagonal when invdiag is + TRUE, zero the region opposite uploc, zero the padding out to + m_panel_max x n_panel_max, and place ones on the diagonal of the + bottom-right padding block. strucc is accepted but never read in + this body. */ +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* kappa, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + ctype* p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + ctype* restrict c_begin = c; \ + ctype* restrict p_begin = p; \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict zero = PASTEMAC(ch,0); \ +\ + dim_t panel_dim; \ + dim_t panel_len; \ + inc_t incc, ldc; \ + inc_t ldp; \ +\ +\ + /* If the strides of p indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Pack the panel. */ \ + PASTEMAC(ch,packm_cxk)( conjc, \ + panel_dim, \ + panel_len, \ + kappa_cast, \ + c_begin, incc, ldc, \ + p_begin, ldp ); \ +\ + /* If the diagonal of C is implicitly unit, overwrite the diagonal of + the packed panel (at offset diagoffp) with kappa rather than one, + presumably so it matches the scaling packm_cxk applied above. */ \ + if ( bli_is_unit_diag( diagc ) ) \ + { \ + PASTEMAC2(ch,ch,setd_unb_var1)( diagoffp, \ + m_panel, \ + n_panel, \ + kappa_cast, \ + p_begin, rs_p, cs_p ); \ + } \ +\ + /* If requested, invert the diagonal of the packed panel. This runs + after the unit-diagonal overwrite above. */ \ + if ( invdiag == TRUE ) \ + { \ + PASTEMAC(ch,invertd_unb_var1)( diagoffp, \ + m_panel, \ + n_panel, \ + p_begin, rs_p, cs_p ); \ + } \ +\ + /* Set the region opposite the diagonal of P to zero.
To do this, + we need to reference the "unstored" region on the other side of + the diagonal. This amounts to toggling uploc and then shifting + the diagonal offset to shrink the newly referenced region (by + one diagonal). NOTE: diagoffp is handed to the shift by name, so it + is presumably updated in place; it is not needed in its original + form afterwards. */ \ + { \ + uplo_t uplop = uploc; \ +\ + bli_toggle_uplo( uplop ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \ +\ + PASTEMAC2(ch,ch,setm_unb_var1)( diagoffp, \ + BLIS_NONUNIT_DIAG, \ + uplop, \ + m_panel, \ + n_panel, \ + zero, \ + p_begin, rs_p, cs_p ); \ + } \ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (i.e., dimensions that were possibly inflated up to a + multiple). When these dimensions are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set it to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. The first block + below zeros rows m_panel..m_panel_max-1 across all n_panel_max + columns; the second zeros columns n_panel..n_panel_max-1 across all + m_panel_max rows. */ \ + if ( m_panel != m_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype* p_edge = p_begin + (i )*rs_p; \ +\ + PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype* p_edge = p_begin + (j )*cs_p; \ +\ + PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ + } \ +\ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity.
*/ \ + if ( m_panel != m_panel_max && \ + n_panel != n_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t j = n_panel; \ + dim_t m_br = m_panel_max - i; \ + dim_t n_br = n_panel_max - j; \ + ctype* one = PASTEMAC(ch,1); \ + ctype* p_edge = p_begin + (i )*rs_p + (j )*cs_p; \ +\ + PASTEMAC2(ch,ch,setd_unb_var1)( 0, \ + m_br, \ + n_br, \ + one, \ + p_edge, rs_p, cs_p ); \ +/* + PASTEMAC(ch,fprintm)( stdout, "packm_var3: setting br unit diag", m_br, n_br, \ + p_edge, rs_p, cs_p, "%4.1f", "" ); \ +*/ \ + } \ +/* + if ( rs_p == 1 ) \ + PASTEMAC(ch,fprintm)( stdout, "packm_var3: ap copied", m_panel_max, n_panel_max, \ + p_begin, rs_p, cs_p, "%4.1f", "" ); \ + if ( cs_p == 1 ) \ + PASTEMAC(ch,fprintm)( stdout, "packm_var3: bp copied", m_panel_max, n_panel_max, \ + p_begin, rs_p, cs_p, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( packm_tri_cxk ) + diff --git a/frame/1m/packm/bli_packm_tri_cxk.h b/frame/1m/packm/bli_packm_tri_cxk.h new file mode 100644 index 000000000..724d31642 --- /dev/null +++ b/frame/1m/packm/bli_packm_tri_cxk.h @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +/* Prototype template for packm_tri_cxk; INSERT_GENTPROT_BASIC below + presumably expands it once per datatype character ch (confirm against + the macro definition). NOTE(review): the second parameter is named + diagoffc here but diagoffp in the definition in bli_packm_tri_cxk.c; + the mismatch is harmless to the compiler, but confirm which name is + intended. */ +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* kappa, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + ctype* p, inc_t rs_p, inc_t cs_p \ + ); + +INSERT_GENTPROT_BASIC( packm_tri_cxk ) +