mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Perform her2k var1 loops in sequence.
Details: - Changed variant 1 of her2k so that the two rank-k products are computed and accumulated in sequence rather than fused into one loop. This is necessary if BLIS is to be configured to provide only enough contiguous memory for one panel of B.
This commit is contained in:
@@ -71,17 +71,85 @@ void bl2_her2k_l_blk_var1( obj_t* alpha,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize objects for packing B' and A'.
|
||||
//
|
||||
// Perform first rank-k update: C = C + alpha * A * B'.
|
||||
//
|
||||
|
||||
// Initialize object for packing B'.
|
||||
bl2_packm_init( bh, &bh_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bl2_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack B' and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
bh, &bh_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = 0; i < m_trans; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bl2_determine_blocksize_b( i, m_trans, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, a, &a1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding region
|
||||
// of Bh_pack. We compute the width of the subpartition taking the
|
||||
// location of the diagonal into account.
|
||||
offL = 0;
|
||||
nL = bl2_min( bl2_obj_width_after_trans( c1 ),
|
||||
bl2_obj_diag_offset_after_trans( c1 ) + b_alg );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &c1, &c1L );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &bh_pack, &bhL_pack );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
bl2_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta,
|
||||
&c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_herk_int( alpha,
|
||||
&a1_pack,
|
||||
&bhL_pack,
|
||||
beta,
|
||||
&c1L_pack,
|
||||
cntl_sub_herk( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1L_pack, &c1L,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a1_pack );
|
||||
bl2_obj_release_pack( &bh_pack );
|
||||
|
||||
//
|
||||
// Perform second rank-k update: C = C + conj(alpha) * B * A'.
|
||||
//
|
||||
|
||||
// Initialize object for packing A'.
|
||||
bl2_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack A' and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj,
|
||||
ah, &ah_pack,
|
||||
@@ -91,43 +159,32 @@ void bl2_her2k_l_blk_var1( obj_t* alpha,
|
||||
for ( i = 0; i < m_trans; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bl2_determine_blocksize_b( i, m_trans, a,
|
||||
b_alg = bl2_determine_blocksize_b( i, m_trans, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1, B1 and C1.
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, a, &a1 );
|
||||
// Acquire partitions for B1 and C1.
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, b, &b1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding regions
|
||||
// of Bh_pack and Ah_pack. We compute the width of the subpartition
|
||||
// taking the location of the diagonal into account.
|
||||
// Partition off the stored region of C1 and the corresponding region
|
||||
// of Ah_pack. We compute the width of the subpartition taking the
|
||||
// location of the diagonal into account.
|
||||
offL = 0;
|
||||
nL = bl2_min( bl2_obj_width_after_trans( c1 ),
|
||||
bl2_obj_diag_offset_after_trans( c1 ) + b_alg );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &c1, &c1L );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &bh_pack, &bhL_pack );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &ah_pack, &ahL_pack );
|
||||
|
||||
// Initialize objects for packing A1, B1, and C1.
|
||||
bl2_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
// Initialize objects for packing B1 and C1.
|
||||
bl2_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack B1 and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj,
|
||||
&b1, &b1_pack,
|
||||
@@ -138,16 +195,13 @@ void bl2_her2k_l_blk_var1( obj_t* alpha,
|
||||
&c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Perform her2k subproblem.
|
||||
bl2_her2k_int( alpha,
|
||||
&a1_pack,
|
||||
&bhL_pack,
|
||||
alpha_conj,
|
||||
&b1_pack,
|
||||
&ahL_pack,
|
||||
beta,
|
||||
&c1L_pack,
|
||||
cntl_sub_her2k( cntl ) );
|
||||
// Perform herk subproblem.
|
||||
bl2_herk_int( alpha_conj,
|
||||
&b1_pack,
|
||||
&ahL_pack,
|
||||
&BLIS_ONE,
|
||||
&c1L_pack,
|
||||
cntl_sub_herk( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1L_pack, &c1L,
|
||||
@@ -156,8 +210,6 @@ void bl2_her2k_l_blk_var1( obj_t* alpha,
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a1_pack );
|
||||
bl2_obj_release_pack( &bh_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
bl2_obj_release_pack( &ah_pack );
|
||||
bl2_obj_release_pack( &c1L_pack );
|
||||
|
||||
@@ -71,17 +71,84 @@ void bl2_her2k_u_blk_var1( obj_t* alpha,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize objects for packing B' and A'.
|
||||
//
|
||||
// Perform first rank-k update: C = C + alpha * A * B'.
|
||||
//
|
||||
|
||||
// Initialize object for packing B'.
|
||||
bl2_packm_init( bh, &bh_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bl2_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack B' and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
bh, &bh_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = 0; i < m_trans; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bl2_determine_blocksize_f( i, m_trans, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
i, b_alg, a, &a1 );
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding region
|
||||
// of Bh_pack. We compute the width of the subpartition taking the
|
||||
// location of the diagonal into account.
|
||||
offR = bl2_max( 0, bl2_obj_diag_offset_after_trans( c1 ) );
|
||||
nR = bl2_obj_width_after_trans( c1 ) - offR;
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offR, nR, &c1, &c1R );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offR, nR, &bh_pack, &bhR_pack );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
bl2_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1R, &c1R_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta,
|
||||
&c1R, &c1R_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_herk_int( alpha,
|
||||
&a1_pack,
|
||||
&bhR_pack,
|
||||
beta,
|
||||
&c1R_pack,
|
||||
cntl_sub_herk( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1R_pack, &c1R,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a1_pack );
|
||||
bl2_obj_release_pack( &bh_pack );
|
||||
|
||||
//
|
||||
// Perform second rank-k update: C = C + conj(alpha) * B * A'.
|
||||
//
|
||||
|
||||
// Initialize object for packing A'.
|
||||
bl2_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack A' and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj,
|
||||
ah, &ah_pack,
|
||||
@@ -91,42 +158,31 @@ void bl2_her2k_u_blk_var1( obj_t* alpha,
|
||||
for ( i = 0; i < m_trans; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bl2_determine_blocksize_f( i, m_trans, a,
|
||||
b_alg = bl2_determine_blocksize_f( i, m_trans, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1, B1, and C1.
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
i, b_alg, a, &a1 );
|
||||
// Acquire partitions for B1 and C1.
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
i, b_alg, b, &b1 );
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding regions
|
||||
// of Bh_pack and Ah_pack. We compute the width of the subpartition
|
||||
// taking the location of the diagonal into account.
|
||||
// Partition off the stored region of C1 and the corresponding region
|
||||
// of Ah_pack. We compute the width of the subpartition taking the
|
||||
// location of the diagonal into account.
|
||||
offR = bl2_max( 0, bl2_obj_diag_offset_after_trans( c1 ) );
|
||||
nR = bl2_obj_width_after_trans( c1 ) - offR;
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offR, nR, &c1, &c1R );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offR, nR, &bh_pack, &bhR_pack );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offR, nR, &ah_pack, &ahR_pack );
|
||||
|
||||
// Initialize objects for packing A1, B1, and C1.
|
||||
bl2_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
// Initialize objects for packing B1 and C1.
|
||||
bl2_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1R, &c1R_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack B1 and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj,
|
||||
&b1, &b1_pack,
|
||||
@@ -137,16 +193,13 @@ void bl2_her2k_u_blk_var1( obj_t* alpha,
|
||||
&c1R, &c1R_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Perform her2k subproblem.
|
||||
bl2_her2k_int( alpha,
|
||||
&a1_pack,
|
||||
&bhR_pack,
|
||||
alpha_conj,
|
||||
&b1_pack,
|
||||
&ahR_pack,
|
||||
beta,
|
||||
&c1R_pack,
|
||||
cntl_sub_her2k( cntl ) );
|
||||
// Perform herk subproblem.
|
||||
bl2_herk_int( alpha_conj,
|
||||
&b1_pack,
|
||||
&ahR_pack,
|
||||
&BLIS_ONE,
|
||||
&c1R_pack,
|
||||
cntl_sub_herk( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1R_pack, &c1R,
|
||||
@@ -155,8 +208,6 @@ void bl2_her2k_u_blk_var1( obj_t* alpha,
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a1_pack );
|
||||
bl2_obj_release_pack( &bh_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
bl2_obj_release_pack( &ah_pack );
|
||||
bl2_obj_release_pack( &c1R_pack );
|
||||
|
||||
Reference in New Issue
Block a user