diff --git a/frame/3/her2k/bl2_her2k_l_blk_var1.c b/frame/3/her2k/bl2_her2k_l_blk_var1.c index cc86f2327..863826134 100644 --- a/frame/3/her2k/bl2_her2k_l_blk_var1.c +++ b/frame/3/her2k/bl2_her2k_l_blk_var1.c @@ -71,17 +71,85 @@ void bl2_her2k_l_blk_var1( obj_t* alpha, c, cntl_sub_scalm( cntl ) ); - // Initialize objects for packing B' and A'. + // + // Perform first rank-k update: C = C + alpha * A * B'. + // + + // Initialize object for packing B'. bl2_packm_init( bh, &bh_pack, cntl_sub_packm_b( cntl ) ); - bl2_packm_init( ah, &ah_pack, - cntl_sub_packm_b( cntl ) ); // Pack B' and scale by alpha (if instructed). bl2_packm_int( alpha, bh, &bh_pack, cntl_sub_packm_b( cntl ) ); + // Partition along the m dimension. + for ( i = 0; i < m_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bl2_determine_blocksize_b( i, m_trans, a, + cntl_blocksize( cntl ) ); + + // Acquire partitions for A1 and C1. + bl2_acquire_mpart_b2t( BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bl2_acquire_mpart_b2t( BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Partition off the stored region of C1 and the corresponding region + // of Bh_pack. We compute the width of the subpartition taking the + // location of the diagonal into account. + offL = 0; + nL = bl2_min( bl2_obj_width_after_trans( c1 ), + bl2_obj_diag_offset_after_trans( c1 ) + b_alg ); + bl2_acquire_mpart_l2r( BLIS_SUBPART1, + offL, nL, &c1, &c1L ); + bl2_acquire_mpart_l2r( BLIS_SUBPART1, + offL, nL, &bh_pack, &bhL_pack ); + + // Initialize objects for packing A1 and C1. + bl2_packm_init( &a1, &a1_pack, + cntl_sub_packm_a( cntl ) ); + bl2_packm_init( &c1L, &c1L_pack, + cntl_sub_packm_c( cntl ) ); + + // Pack A1 and scale by alpha (if instructed). + bl2_packm_int( alpha, + &a1, &a1_pack, + cntl_sub_packm_a( cntl ) ); + + // Pack C1 and scale by beta (if instructed). + bl2_packm_int( beta, + &c1L, &c1L_pack, + cntl_sub_packm_c( cntl ) ); + + // Perform herk subproblem. + bl2_herk_int( alpha, + &a1_pack, + &bhL_pack, + beta, + &c1L_pack, + cntl_sub_herk( cntl ) ); + + // Unpack C1 (if C1 was packed). + bl2_unpackm_int( &c1L_pack, &c1L, + cntl_sub_unpackm_c( cntl ) ); + } + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. + bl2_obj_release_pack( &a1_pack ); + bl2_obj_release_pack( &bh_pack ); + + // + // Perform second rank-k update: C = C + conj(alpha) * B * A'. + // + + // Initialize object for packing A'. + bl2_packm_init( ah, &ah_pack, + cntl_sub_packm_b( cntl ) ); + // Pack A' and scale by alpha_conj (if instructed). bl2_packm_int( alpha_conj, ah, &ah_pack, @@ -91,43 +159,32 @@ void bl2_her2k_l_blk_var1( obj_t* alpha, for ( i = 0; i < m_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bl2_determine_blocksize_b( i, m_trans, a, + b_alg = bl2_determine_blocksize_b( i, m_trans, b, cntl_blocksize( cntl ) ); - // Acquire partitions for A1, B1 and C1. - bl2_acquire_mpart_b2t( BLIS_SUBPART1, - i, b_alg, a, &a1 ); + // Acquire partitions for B1 and C1. bl2_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, b, &b1 ); bl2_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, c, &c1 ); - // Partition off the stored region of C1 and the corresponding regions - // of Bh_pack and Ah_pack. We compute the width of the subpartition - // taking the location of the diagonal into account. + // Partition off the stored region of C1 and the corresponding region + // of Ah_pack. We compute the width of the subpartition taking the + // location of the diagonal into account. offL = 0; nL = bl2_min( bl2_obj_width_after_trans( c1 ), bl2_obj_diag_offset_after_trans( c1 ) + b_alg ); bl2_acquire_mpart_l2r( BLIS_SUBPART1, offL, nL, &c1, &c1L ); - bl2_acquire_mpart_l2r( BLIS_SUBPART1, - offL, nL, &bh_pack, &bhL_pack ); bl2_acquire_mpart_l2r( BLIS_SUBPART1, offL, nL, &ah_pack, &ahL_pack ); - // Initialize objects for packing A1, B1, and C1. - bl2_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + // Initialize objects for packing B1 and C1. bl2_packm_init( &b1, &b1_pack, cntl_sub_packm_a( cntl ) ); bl2_packm_init( &c1L, &c1L_pack, cntl_sub_packm_c( cntl ) ); - // Pack A1 and scale by alpha (if instructed). - bl2_packm_int( alpha, - &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - // Pack B1 and scale by alpha_conj (if instructed). bl2_packm_int( alpha_conj, &b1, &b1_pack, @@ -138,16 +195,13 @@ void bl2_her2k_l_blk_var1( obj_t* alpha, &c1L, &c1L_pack, cntl_sub_packm_c( cntl ) ); - // Perform her2k subproblem. - bl2_her2k_int( alpha, - &a1_pack, - &bhL_pack, - alpha_conj, - &b1_pack, - &ahL_pack, - beta, - &c1L_pack, - cntl_sub_her2k( cntl ) ); + // Perform herk subproblem. + bl2_herk_int( alpha_conj, + &b1_pack, + &ahL_pack, + &BLIS_ONE, + &c1L_pack, + cntl_sub_herk( cntl ) ); // Unpack C1 (if C1 was packed). bl2_unpackm_int( &c1L_pack, &c1L, @@ -156,8 +210,6 @@ void bl2_her2k_l_blk_var1( obj_t* alpha, // If any packing buffers were acquired within packm, release them back // to the memory manager. - bl2_obj_release_pack( &a1_pack ); - bl2_obj_release_pack( &bh_pack ); bl2_obj_release_pack( &b1_pack ); bl2_obj_release_pack( &ah_pack ); bl2_obj_release_pack( &c1L_pack ); diff --git a/frame/3/her2k/bl2_her2k_u_blk_var1.c b/frame/3/her2k/bl2_her2k_u_blk_var1.c index ad106419b..dbfe59c71 100644 --- a/frame/3/her2k/bl2_her2k_u_blk_var1.c +++ b/frame/3/her2k/bl2_her2k_u_blk_var1.c @@ -71,17 +71,84 @@ void bl2_her2k_u_blk_var1( obj_t* alpha, c, cntl_sub_scalm( cntl ) ); - // Initialize objects for packing B' and A'. + // + // Perform first rank-k update: C = C + alpha * A * B'. + // + + // Initialize object for packing B'. bl2_packm_init( bh, &bh_pack, cntl_sub_packm_b( cntl ) ); - bl2_packm_init( ah, &ah_pack, - cntl_sub_packm_b( cntl ) ); // Pack B' and scale by alpha (if instructed). bl2_packm_int( alpha, bh, &bh_pack, cntl_sub_packm_b( cntl ) ); + // Partition along the m dimension. + for ( i = 0; i < m_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bl2_determine_blocksize_f( i, m_trans, a, + cntl_blocksize( cntl ) ); + + // Acquire partitions for A1 and C1. + bl2_acquire_mpart_t2b( BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bl2_acquire_mpart_t2b( BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Partition off the stored region of C1 and the corresponding region + // of Bh_pack. We compute the width of the subpartition taking the + // location of the diagonal into account. + offR = bl2_max( 0, bl2_obj_diag_offset_after_trans( c1 ) ); + nR = bl2_obj_width_after_trans( c1 ) - offR; + bl2_acquire_mpart_l2r( BLIS_SUBPART1, + offR, nR, &c1, &c1R ); + bl2_acquire_mpart_l2r( BLIS_SUBPART1, + offR, nR, &bh_pack, &bhR_pack ); + + // Initialize objects for packing A1 and C1. + bl2_packm_init( &a1, &a1_pack, + cntl_sub_packm_a( cntl ) ); + bl2_packm_init( &c1R, &c1R_pack, + cntl_sub_packm_c( cntl ) ); + + // Pack A1 and scale by alpha (if instructed). + bl2_packm_int( alpha, + &a1, &a1_pack, + cntl_sub_packm_a( cntl ) ); + + // Pack C1 and scale by beta (if instructed). + bl2_packm_int( beta, + &c1R, &c1R_pack, + cntl_sub_packm_c( cntl ) ); + + // Perform herk subproblem. + bl2_herk_int( alpha, + &a1_pack, + &bhR_pack, + beta, + &c1R_pack, + cntl_sub_herk( cntl ) ); + + // Unpack C1 (if C1 was packed). + bl2_unpackm_int( &c1R_pack, &c1R, + cntl_sub_unpackm_c( cntl ) ); + } + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. + bl2_obj_release_pack( &a1_pack ); + bl2_obj_release_pack( &bh_pack ); + + // + // Perform second rank-k update: C = C + conj(alpha) * B * A'. + // + + // Initialize object for packing A'. + bl2_packm_init( ah, &ah_pack, + cntl_sub_packm_b( cntl ) ); + // Pack A' and scale by alpha_conj (if instructed). bl2_packm_int( alpha_conj, ah, &ah_pack, @@ -91,42 +158,31 @@ void bl2_her2k_u_blk_var1( obj_t* alpha, for ( i = 0; i < m_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bl2_determine_blocksize_f( i, m_trans, a, + b_alg = bl2_determine_blocksize_f( i, m_trans, b, cntl_blocksize( cntl ) ); - // Acquire partitions for A1, B1, and C1. - bl2_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); + // Acquire partitions for B1 and C1. bl2_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, b, &b1 ); bl2_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 ); - // Partition off the stored region of C1 and the corresponding regions - // of Bh_pack and Ah_pack. We compute the width of the subpartition - // taking the location of the diagonal into account. + // Partition off the stored region of C1 and the corresponding region + // of Ah_pack. We compute the width of the subpartition taking the + // location of the diagonal into account. offR = bl2_max( 0, bl2_obj_diag_offset_after_trans( c1 ) ); nR = bl2_obj_width_after_trans( c1 ) - offR; bl2_acquire_mpart_l2r( BLIS_SUBPART1, offR, nR, &c1, &c1R ); - bl2_acquire_mpart_l2r( BLIS_SUBPART1, - offR, nR, &bh_pack, &bhR_pack ); bl2_acquire_mpart_l2r( BLIS_SUBPART1, offR, nR, &ah_pack, &ahR_pack ); - // Initialize objects for packing A1, B1, and C1. - bl2_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + // Initialize objects for packing B1 and C1. bl2_packm_init( &b1, &b1_pack, cntl_sub_packm_a( cntl ) ); bl2_packm_init( &c1R, &c1R_pack, cntl_sub_packm_c( cntl ) ); - // Pack A1 and scale by alpha (if instructed). - bl2_packm_int( alpha, - &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - // Pack B1 and scale by alpha_conj (if instructed). bl2_packm_int( alpha_conj, &b1, &b1_pack, @@ -137,16 +193,13 @@ void bl2_her2k_u_blk_var1( obj_t* alpha, &c1R, &c1R_pack, cntl_sub_packm_c( cntl ) ); - // Perform her2k subproblem. - bl2_her2k_int( alpha, - &a1_pack, - &bhR_pack, - alpha_conj, - &b1_pack, - &ahR_pack, - beta, - &c1R_pack, - cntl_sub_her2k( cntl ) ); + // Perform herk subproblem. + bl2_herk_int( alpha_conj, + &b1_pack, + &ahR_pack, + &BLIS_ONE, + &c1R_pack, + cntl_sub_herk( cntl ) ); // Unpack C1 (if C1 was packed). bl2_unpackm_int( &c1R_pack, &c1R, @@ -155,8 +208,6 @@ void bl2_her2k_u_blk_var1( obj_t* alpha, // If any packing buffers were acquired within packm, release them back // to the memory manager. - bl2_obj_release_pack( &a1_pack ); - bl2_obj_release_pack( &bh_pack ); bl2_obj_release_pack( &b1_pack ); bl2_obj_release_pack( &ah_pack ); bl2_obj_release_pack( &c1R_pack );